diff --git a/.gitmodules b/.gitmodules
deleted file mode 100644
index 3964809..0000000
--- a/.gitmodules
+++ /dev/null
@@ -1,3 +0,0 @@
-[submodule "rocFFT"]
-	path = clients/rocFFT
-	url = https://github.com/ROCmSoftwarePlatform/rocFFT.git
diff --git a/CHANGELOG.md b/CHANGELOG.md
index e0a2f42..cc23af9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,10 @@ Documentation for hipFFT is available at
 
 ## hipFFT 1.0.14 for ROCm 6.1.0
 
+### Changes
+
+* When building hipFFT from source, rocFFT code no longer needs to be initialized as a git submodule.
+
 ### Fixes
 
 * Fixed error when creating length-1 plans.
diff --git a/README.md b/README.md
index a852205..135b4f3 100644
--- a/README.md
+++ b/README.md
@@ -61,8 +61,6 @@ To build hipFFT from source, follow these steps:
 
    * The clients (samples, tests, etc) included with the hipFFT source depend on FFTW, GoogleTest, and boost program options.
 
-   * The bench and test clients also require the rocFFT source tree (`git submodule update --init`).
-
 3. Build hipFFT:
 
    To show all build options:
diff --git a/clients/CMakeLists.txt b/clients/CMakeLists.txt
index 1db0d9c..b99a9e5 100644
--- a/clients/CMakeLists.txt
+++ b/clients/CMakeLists.txt
@@ -65,21 +65,6 @@ if( NOT CMAKE_CXX_COMPILER MATCHES ".*/hipcc$" AND NOT CMAKE_CXX_COMPILER_ID STR
 endif()
 
-if( GIT_FOUND AND EXISTS "${CMAKE_SOURCE_DIR}/.git" )
-  message(STATUS "rocFFT submodule update")
-  execute_process(COMMAND ${GIT_EXECUTABLE} submodule update --init --recursive
-                  WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/clients/rocFFT
-                  RESULT_VARIABLE GIT_SUBMOD_RESULT)
-  if( NOT GIT_SUBMOD_RESULT EQUAL "0" )
-    message(FATAL_ERROR "git submodule update --init --recursive failed with ${GIT_SUBMOD_RESULT}, please checkout submodules manually.")
-  endif( )
-endif( )
-
-if( NOT EXISTS "${CMAKE_SOURCE_DIR}/clients/rocFFT/CMakeLists.txt" )
-  message(FATAL_ERROR "The rocFFT submodule is not present! Please update git submodules and try again. ${CMAKE_CURRENT_SOURCE_DIR}/clients/rocFFT/CMakeLists.txt")
-endif( )
-
-
 # This option only works for make/nmake and the ninja generators, but no reason it shouldn't be on
 # all the time
 # This tells cmake to create a compile_commands.json file that can be used with clang tooling or vim
diff --git a/clients/bench/CMakeLists.txt b/clients/bench/CMakeLists.txt
index b5cef9b..ccb8c29 100644
--- a/clients/bench/CMakeLists.txt
+++ b/clients/bench/CMakeLists.txt
@@ -26,8 +26,8 @@ find_package( Boost COMPONENTS program_options REQUIRED)
 set( Boost_USE_STATIC_LIBS OFF )
 
-set( hipfft_bench_source bench.cpp ../rocFFT/shared/array_validator.cpp )
-set( hipfft_bench_includes bench.h ../rocFFT/shared/array_validator.h )
+set( hipfft_bench_source bench.cpp ../../shared/array_validator.cpp )
+set( hipfft_bench_includes bench.h ../../shared/array_validator.h )
 
 add_executable( hipfft-bench ${hipfft_bench_source} ${hipfft_bench_includes} )
diff --git a/clients/bench/bench.cpp b/clients/bench/bench.cpp
index ca60896..d2778de 100644
--- a/clients/bench/bench.cpp
+++ b/clients/bench/bench.cpp
@@ -29,7 +29,7 @@
 #include
 namespace po = boost::program_options;
 
-#include "../rocFFT/shared/gpubuf.h"
+#include "../../shared/gpubuf.h"
 
 int main(int argc, char* argv[])
 {
diff --git a/clients/hipfft_params.h b/clients/hipfft_params.h
index b8b58ac..75d9db9 100644
--- a/clients/hipfft_params.h
+++ b/clients/hipfft_params.h
@@ -23,9 +23,9 @@
 
 #include
 
+#include "../shared/fft_params.h"
 #include "hipfft/hipfft.h"
 #include "hipfft/hipfftXt.h"
-#include "rocFFT/shared/fft_params.h"
 
 inline fft_status fft_status_from_hipfftparams(const hipfftResult_t val)
 {
diff --git a/clients/rocFFT b/clients/rocFFT
deleted file mode 160000
index d1c9113..0000000
--- a/clients/rocFFT
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit d1c91135da99acd2c690e9aae619642ab57b0914
diff --git a/clients/tests/CMakeLists.txt b/clients/tests/CMakeLists.txt
index 9742a45..2d1aac0 100644
--- a/clients/tests/CMakeLists.txt
+++ b/clients/tests/CMakeLists.txt
@@ -37,14 +37,7 @@ set( hipfft-test_source
   accuracy_test_3D.cpp
   accuracy_test_callback.cpp
   multi_device_test.cpp
-  ../rocFFT/shared/array_validator.cpp
-  )
-
-set( hipfft-test_includes
-  ../rocFFT/clients/tests/fftw_transform.h
-  ../rocFFT/clients/tests/rocfft_against_fftw.h
-  ../rocFFT/clients/tests/misc/include/test_exception.h
-  ../rocFFT/shared/array_validator.h
+  ../../shared/array_validator.cpp
   )
 
 add_executable( hipfft-test ${hipfft-test_source} ${hipfft-test_includes} )
@@ -56,8 +49,6 @@ target_include_directories(
   $
   $
   $
-  $
-  $
   )
diff --git a/clients/tests/accuracy_test_1D.cpp b/clients/tests/accuracy_test_1D.cpp
index 27e849d..57d846a 100644
--- a/clients/tests/accuracy_test_1D.cpp
+++ b/clients/tests/accuracy_test_1D.cpp
@@ -23,11 +23,11 @@
 
 #include
 #include
 
-#include "../rocFFT/shared/fft_params.h"
+#include "../../shared/fft_params.h"
 
-#include "accuracy_test.h"
-#include "fftw_transform.h"
-#include "rocfft_against_fftw.h"
+#include "../../shared/accuracy_test.h"
+#include "../../shared/fftw_transform.h"
+#include "../../shared/rocfft_against_fftw.h"
 
 using ::testing::ValuesIn;
diff --git a/clients/tests/accuracy_test_2D.cpp b/clients/tests/accuracy_test_2D.cpp
index 1674593..6f618c0 100644
--- a/clients/tests/accuracy_test_2D.cpp
+++ b/clients/tests/accuracy_test_2D.cpp
@@ -23,11 +23,11 @@
 
 #include
 #include
 
-#include "../rocFFT/shared/fft_params.h"
+#include "../../shared/fft_params.h"
 
-#include "accuracy_test.h"
-#include "fftw_transform.h"
-#include "rocfft_against_fftw.h"
+#include "../../shared/accuracy_test.h"
+#include "../../shared/fftw_transform.h"
+#include "../../shared/rocfft_against_fftw.h"
 
 using ::testing::ValuesIn;
diff --git a/clients/tests/accuracy_test_3D.cpp b/clients/tests/accuracy_test_3D.cpp
index a87476a..941ec24 100644
--- a/clients/tests/accuracy_test_3D.cpp
+++ b/clients/tests/accuracy_test_3D.cpp
@@ -23,11 +23,11 @@
 
 #include
 #include
 
-#include "../rocFFT/shared/fft_params.h"
+#include "../../shared/fft_params.h"
 
-#include "accuracy_test.h"
-#include "fftw_transform.h"
-#include "rocfft_against_fftw.h"
+#include "../../shared/accuracy_test.h"
+#include "../../shared/fftw_transform.h"
+#include "../../shared/rocfft_against_fftw.h"
 
 using ::testing::ValuesIn;
diff --git a/clients/tests/accuracy_test_callback.cpp b/clients/tests/accuracy_test_callback.cpp
index 4782830..b5cc4a7 100644
--- a/clients/tests/accuracy_test_callback.cpp
+++ b/clients/tests/accuracy_test_callback.cpp
@@ -18,7 +18,7 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.
 
-#include "accuracy_test.h"
+#include "../../shared/accuracy_test.h"
 
 std::vector> callback_sizes = {
     // some single kernel sizes
diff --git a/clients/tests/gtest_main.cpp b/clients/tests/gtest_main.cpp
index 1f0ae83..2f7674e 100644
--- a/clients/tests/gtest_main.cpp
+++ b/clients/tests/gtest_main.cpp
@@ -30,10 +30,10 @@
 #include
 #include
 
+#include "../../shared/concurrency.h"
+#include "../../shared/environment.h"
+#include "../../shared/work_queue.h"
 #include "../hipfft_params.h"
-#include "../rocFFT/shared/concurrency.h"
-#include "../rocFFT/shared/environment.h"
-#include "../rocFFT/shared/work_queue.h"
 #include "hipfft/hipfft.h"
 #include "hipfft_accuracy_test.h"
 #include "hipfft_test_params.h"
diff --git a/clients/tests/hipfft_accuracy_test.cpp b/clients/tests/hipfft_accuracy_test.cpp
index 2abaf74..609239a 100644
--- a/clients/tests/hipfft_accuracy_test.cpp
+++ b/clients/tests/hipfft_accuracy_test.cpp
@@ -29,11 +29,12 @@
 #include "hipfft/hipfft.h"
 
 #include "../hipfft_params.h"
-#include "../rocFFT/clients/tests/fftw_transform.h"
-#include "../rocFFT/clients/tests/rocfft_accuracy_test.h"
-#include "../rocFFT/clients/tests/rocfft_against_fftw.h"
-#include "../rocFFT/shared/gpubuf.h"
-#include "../rocFFT/shared/rocfft_complex.h"
+
+#include "../../shared/accuracy_test.h"
+#include "../../shared/fftw_transform.h"
+#include "../../shared/gpubuf.h"
+#include "../../shared/rocfft_against_fftw.h"
+#include "../../shared/rocfft_complex.h"
 
 void fft_vs_reference(hipfft_params& params, bool round_trip)
 {
diff --git a/clients/tests/hipfft_accuracy_test.h b/clients/tests/hipfft_accuracy_test.h
index 0491bd9..181150e 100644
--- a/clients/tests/hipfft_accuracy_test.h
+++ b/clients/tests/hipfft_accuracy_test.h
@@ -23,8 +23,8 @@
 #ifndef ROCFFT_ACCURACY_TEST
 #define ROCFFT_ACCURACY_TEST
 
+#include "../../shared/accuracy_test.h"
 #include "../hipfft_params.h"
-#include "../rocFFT/clients/tests/accuracy_test.h"
 
 void fft_vs_reference(hipfft_params& params, bool round_trip = false);
diff --git a/clients/tests/multi_device_test.cpp b/clients/tests/multi_device_test.cpp
index b3dc4c9..3274b80 100644
--- a/clients/tests/multi_device_test.cpp
+++ b/clients/tests/multi_device_test.cpp
@@ -18,7 +18,7 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.
 
-#include "accuracy_test.h"
+#include "../../shared/accuracy_test.h"
 
 #include
 #include
diff --git a/cmake/dependencies.cmake b/cmake/dependencies.cmake
index 5810e37..bdbf689 100644
--- a/cmake/dependencies.cmake
+++ b/cmake/dependencies.cmake
@@ -21,9 +21,6 @@
 # #
 # #############################################################################
 
-# Git
-find_package(Git REQUIRED)
-
 # HIP
 if( NOT CMAKE_CXX_COMPILER MATCHES ".*/hipcc$" )
   if( NOT BUILD_WITH_LIB STREQUAL "CUDA" )
diff --git a/library/src/amd_detail/hipfft.cpp b/library/src/amd_detail/hipfft.cpp
index 0278b88..b5e9079 100644
--- a/library/src/amd_detail/hipfft.cpp
+++ b/library/src/amd_detail/hipfft.cpp
@@ -27,10 +27,10 @@
 #include
 #include
 
-#include "../../../clients/rocFFT/shared/arithmetic.h"
-#include "../../../clients/rocFFT/shared/gpubuf.h"
-#include "../../../clients/rocFFT/shared/ptrdiff.h"
-#include "../../../clients/rocFFT/shared/rocfft_hip.h"
+#include "../../../shared/arithmetic.h"
+#include "../../../shared/gpubuf.h"
+#include "../../../shared/ptrdiff.h"
+#include "../../../shared/rocfft_hip.h"
 
 #define ROC_FFT_CHECK_ALLOC_FAILED(ret) \
 {                                       \
diff --git a/shared/accuracy_test.h b/shared/accuracy_test.h
new file mode 100644
index 0000000..362a7c1
--- /dev/null
+++ b/shared/accuracy_test.h
@@ -0,0 +1,1949 @@
+// Copyright (C) 2020 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#pragma once
+
+#ifndef ACCURACY_TEST
+#define ACCURACY_TEST
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "enum_to_string.h"
+#include "fft_params.h"
+#include "fftw_transform.h"
+#include "gpubuf.h"
+#include "rocfft_against_fftw.h"
+#include "test_params.h"
+
+extern int verbose;
+extern size_t ramgb;
+extern bool fftw_compare;
+
+static const size_t ONE_GiB = 1 << 30;
+
+inline size_t bytes_to_GiB(const size_t bytes)
+{
+    return bytes == 0 ? 0 : (bytes - 1 + ONE_GiB) / ONE_GiB;
+}
+
+typedef std::tuple
+    type_place_io_t;
+
+// Remember the results of the last FFT we computed with FFTW. Tests
+// are ordered so that later cases can often reuse this result.
+struct last_cpu_fft_cache +{ + // keys to the cache + std::vector length; + size_t nbatch = 0; + fft_transform_type transform_type = fft_transform_type_complex_forward; + bool run_callbacks = false; + fft_precision precision = fft_precision_single; + + // FFTW input/output + std::vector cpu_input; + std::vector cpu_output; +}; +extern last_cpu_fft_cache last_cpu_fft_data; + +struct system_memory +{ + size_t total_bytes = 0; + size_t free_bytes = 0; +}; +extern system_memory start_memory; + +system_memory get_system_memory(); + +// Estimate the amount of host memory needed for buffers. +inline size_t needed_ram_buffers(const fft_params& params, const int verbose) +{ + // This calculation is assuming contiguous data but noncontiguous buffers + // are assumed to require a close enough amount of space for the purposes + // of this estimate. + + size_t needed_ram = 6 + * std::accumulate(params.length.begin(), + params.length.end(), + static_cast(1), + std::multiplies()); + + // Account for precision and data type: + if(params.transform_type != fft_transform_type_real_forward + && params.transform_type != fft_transform_type_real_inverse) + { + needed_ram *= 2; + } + switch(params.precision) + { + case fft_precision_half: + needed_ram *= 2; + break; + case fft_precision_single: + needed_ram *= 4; + break; + case fft_precision_double: + needed_ram *= 8; + break; + } + + needed_ram *= params.nbatch; + + if(verbose) + { + std::cout << "required host memory for buffers (GiB): " << bytes_to_GiB(needed_ram) << "\n"; + } + + return needed_ram; +} + +template +bool fftw_plan_uses_bluestein(const typename fftw_trait::fftw_plan_type& cpu_plan) +{ +#ifdef FFTW_HAVE_SPRINT_PLAN + char* print_plan_c_str = fftw_sprint_plan(cpu_plan); + std::string print_plan(print_plan_c_str); + free(print_plan_c_str); + return print_plan.find("bluestein") != std::string::npos; +#else + // assume worst case (bluestein is always used) + return true; +#endif +} + +// Estimate the amount of host memory needed for fftw. 
+template +inline size_t needed_ram_fftw(const fft_params& contiguous_params, + const typename fftw_trait::fftw_plan_type& cpu_plan, + const int verbose) +{ + size_t total_length = std::accumulate(contiguous_params.length.begin(), + contiguous_params.length.end(), + static_cast(1), + std::multiplies()); + size_t needed_ram = 0; + // Detect Bluestein in plan + if(fftw_plan_uses_bluestein(cpu_plan)) + { + for(size_t dim : contiguous_params.length) + { + unsigned int needed_ram_dim = dim; + + // Next-plus-one-power-of-two multiplied any other lengths + needed_ram_dim--; + + needed_ram_dim |= needed_ram_dim >> 2; + needed_ram_dim |= needed_ram_dim >> 4; + needed_ram_dim |= needed_ram_dim >> 8; + needed_ram_dim |= needed_ram_dim >> 16; + + needed_ram_dim++; + + needed_ram_dim *= 2 * (total_length / dim); + + if(needed_ram_dim > needed_ram) + { + needed_ram = needed_ram_dim; + } + } + } + + // Account for precision and data type: + if(contiguous_params.transform_type != fft_transform_type_real_forward + && contiguous_params.transform_type != fft_transform_type_real_inverse) + { + needed_ram *= 2; + } + switch(contiguous_params.precision) + { + case fft_precision_half: + needed_ram *= 2; + break; + case fft_precision_single: + needed_ram *= 4; + break; + case fft_precision_double: + needed_ram *= 8; + break; + } + + needed_ram *= contiguous_params.nbatch; + + if(verbose) + { + std::cout << "required host memory for FFTW (GiB): " << bytes_to_GiB(needed_ram) << "\n"; + } + + return needed_ram; +} + +// Base gtest class for comparison with FFTW. +class accuracy_test : public ::testing::TestWithParam +{ +protected: + void SetUp() override {} + void TearDown() override {} + +public: + static std::string TestName(const testing::TestParamInfo& info) + { + return info.param.token(); + } +}; + +const static std::vector batch_range = {2, 1}; + +const static std::vector precision_range_full + = {fft_precision_double, fft_precision_single, fft_precision_half}; +const static std::vector precision_range_sp_dp + = {fft_precision_double, fft_precision_single}; + +const static std::vector place_range + = {fft_placement_inplace, fft_placement_notinplace}; +const static std::vector trans_type_range + = {fft_transform_type_complex_forward, fft_transform_type_real_forward}; +const static std::vector trans_type_range_complex + = {fft_transform_type_complex_forward}; +const static std::vector trans_type_range_real + = {fft_transform_type_real_forward}; + +// Given a vector of vector of lengths, generate all unique permutations. +// Add an optional vector of ad-hoc lengths to the result. +inline std::vector> + generate_lengths(const std::vector>& inlengths) +{ + std::vector> output; + if(inlengths.size() == 0) + { + return output; + } + const size_t dim = inlengths.size(); + std::vector looplength(dim); + for(unsigned int i = 0; i < dim; ++i) + { + looplength[i] = inlengths[i].size(); + } + for(unsigned int idx = 0; idx < inlengths.size(); ++idx) + { + std::vector index(dim); + do + { + std::vector length(dim); + for(unsigned int i = 0; i < dim; ++i) + { + length[i] = inlengths[i][index[i]]; + } + output.push_back(length); + } while(increment_rowmajor(index, looplength)); + } + // uniquify the result + std::sort(output.begin(), output.end()); + output.erase(std::unique(output.begin(), output.end()), output.end()); + return output; +} + +// Return the valid rocFFT input and output types for a given transform type. 
+inline std::vector> + iotypes(const fft_transform_type transformType, + const fft_result_placement place, + const bool planar = true) +{ + std::vector> iotypes; + switch(transformType) + { + case fft_transform_type_complex_forward: + case fft_transform_type_complex_inverse: + iotypes.push_back(std::make_pair( + fft_array_type_complex_interleaved, fft_array_type_complex_interleaved)); + if(planar) + { + iotypes.push_back(std::make_pair( + fft_array_type_complex_planar, fft_array_type_complex_planar)); + if(place == fft_placement_notinplace) + { + iotypes.push_back(std::make_pair( + fft_array_type_complex_planar, fft_array_type_complex_interleaved)); + iotypes.push_back(std::make_pair( + fft_array_type_complex_interleaved, fft_array_type_complex_planar)); + } + } + break; + case fft_transform_type_real_forward: + iotypes.push_back(std::make_pair( + fft_array_type_real, fft_array_type_hermitian_interleaved)); + if(planar && place == fft_placement_notinplace) + { + iotypes.push_back(std::make_pair( + fft_array_type_real, fft_array_type_hermitian_planar)); + } + break; + case fft_transform_type_real_inverse: + iotypes.push_back(std::make_pair( + fft_array_type_hermitian_interleaved, fft_array_type_real)); + if(planar && place == fft_placement_notinplace) + { + iotypes.push_back(std::make_pair( + fft_array_type_hermitian_planar, fft_array_type_real)); + } + break; + default: + throw std::runtime_error("Invalid transform type"); + } + return iotypes; +} + +// Generate all combinations of input/output types, from combinations of transform and placement +// types. +static std::vector + generate_types(fft_transform_type transform_type, + const std::vector& place_range, + const bool planar) +{ + std::vector ret; + for(auto place : place_range) + { + for(auto iotype : iotypes(transform_type, place, planar)) + { + ret.push_back(std::make_tuple(transform_type, place, iotype.first, iotype.second)); + } + } + return ret; +} + +struct stride_generator +{ + struct stride_dist + { + stride_dist(const std::vector& s, size_t d) + : stride(s) + , dist(d) + { + } + std::vector stride; + size_t dist; + }; + + // NOTE: allow for this ctor to be implicit, so it's less typing for a test writer + // + // cppcheck-suppress noExplicitConstructor + stride_generator(const std::vector>& stride_list_in) + : stride_list(stride_list_in) + { + } + virtual std::vector generate(const std::vector& lengths, + size_t batch) const + { + std::vector ret; + for(const auto& s : stride_list) + ret.emplace_back(s, 0); + return ret; + } + std::vector> stride_list; +}; + +// Generate strides such that batch is essentially the innermost dimension +// e.g. given a batch-2 4x3x2 transform which logically looks like: +// +// batch0: +// A B A B +// A B A B +// A B A B +// +// A B A B +// A B A B +// A B A B +// +// batch1: +// A B A B +// A B A B +// A B A B +// +// A B A B +// A B A B +// A B A B +// +// we instead do stride-2 4x3x2 transform where first batch is the +// A's and second batch is the B's. +struct stride_generator_3D_inner_batch : public stride_generator +{ + explicit stride_generator_3D_inner_batch(const std::vector>& stride_list_in) + : stride_generator(stride_list_in) + { + } + std::vector generate(const std::vector& lengths, + size_t batch) const override + { + std::vector ret = stride_generator::generate(lengths, batch); + std::vector strides{lengths[1] * lengths[2] * batch, lengths[2] * batch, batch}; + ret.emplace_back(strides, 1); + return ret; + } +}; + +// Create an array of parameters to pass to gtest. 
Base generator +// that allows choosing transform type. +inline auto param_generator_base(const std::vector& type_range, + const std::vector>& v_lengths, + const std::vector& precision_range, + const std::vector& batch_range, + decltype(generate_types) types_generator, + const stride_generator& istride, + const stride_generator& ostride, + const std::vector>& ioffset_range, + const std::vector>& ooffset_range, + const std::vector& place_range, + const bool planar = true, + const bool run_callbacks = false) +{ + + std::vector params; + + // For any length, we compute double-precision CPU reference + // for largest batch size first and reuse for smaller batch + // sizes, then convert to single-precision. + + for(auto& transform_type : type_range) + { + for(const auto& lengths : v_lengths) + { + // try to ensure that we are given literal lengths, not + // something to be passed to generate_lengths + if(lengths.empty() || lengths.size() > 3) + { + continue; + } + { + for(const auto precision : precision_range) + { + for(const auto batch : batch_range) + { + for(const auto& types : + types_generator(transform_type, place_range, planar)) + { + for(const auto& istride_dist : istride.generate(lengths, batch)) + { + for(const auto& ostride_dist : ostride.generate(lengths, batch)) + { + for(const auto& ioffset : ioffset_range) + { + for(const auto& ooffset : ooffset_range) + { + fft_params param; + + param.length = lengths; + param.istride = istride_dist.stride; + param.ostride = ostride_dist.stride; + param.nbatch = batch; + param.precision = precision; + param.transform_type = std::get<0>(types); + param.placement = std::get<1>(types); + param.idist = istride_dist.dist; + param.odist = ostride_dist.dist; + param.itype = std::get<2>(types); + param.otype = std::get<3>(types); + param.ioffset = ioffset; + param.ooffset = ooffset; + + if(run_callbacks) + { + // add a test if both input and output support callbacks + if(param.itype != fft_array_type_complex_planar + && param.itype != fft_array_type_hermitian_planar + && param.otype != fft_array_type_complex_planar + && param.otype + != fft_array_type_hermitian_planar) + { + param.run_callbacks = true; + } + else + { + continue; + } + } + param.validate(); + + // Keeping the random number generator here + // allows one to run the same tests for a given + // random seed; ie the test suite is repeatable. + std::hash hasher; + std::ranlux24_base gen(random_seed + + hasher(param.token())); + std::uniform_real_distribution<> dis(0.0, 1.0); + + if(param.is_planar()) + { + const double roll = dis(gen); + if(roll > planar_prob) + { + if(verbose > 4) + { + std::cout << "Planar transform skipped " + "(planar_prob: " + << planar_prob << " > " << roll + << ")\n"; + } + continue; + } + } + if(run_callbacks) + { + const double roll = dis(gen); + if(roll > callback_prob) + { + + if(verbose > 4) + { + std::cout << "Callback transform skipped " + "(planar_prob: " + << planar_prob << " > " << roll + << ")\n"; + } + continue; + } + } + + if(param.valid(0)) + { + params.push_back(param); + } + } + } + } + } + } + } + } + } + } + } + return params; +} + +// Create an array of parameters to pass to gtest. Default generator +// that picks all transform types. 
+inline auto param_generator(const std::vector>& v_lengths, + const std::vector& precision_range, + const std::vector& batch_range, + const stride_generator& istride, + const stride_generator& ostride, + const std::vector>& ioffset_range, + const std::vector>& ooffset_range, + const std::vector& place_range, + const bool planar, + const bool run_callbacks = false) +{ + return param_generator_base(trans_type_range, + v_lengths, + precision_range, + batch_range, + generate_types, + istride, + ostride, + ioffset_range, + ooffset_range, + place_range, + planar, + run_callbacks); +} + +// Create an array of parameters to pass to gtest. Only tests complex-type transforms +inline auto param_generator_complex(const std::vector>& v_lengths, + const std::vector& precision_range, + const std::vector& batch_range, + const stride_generator& istride, + const stride_generator& ostride, + const std::vector>& ioffset_range, + const std::vector>& ooffset_range, + const std::vector& place_range, + const bool planar, + const bool run_callbacks = false) +{ + return param_generator_base(trans_type_range_complex, + v_lengths, + precision_range, + batch_range, + generate_types, + istride, + ostride, + ioffset_range, + ooffset_range, + place_range, + planar, + run_callbacks); +} + +// Create an array of parameters to pass to gtest. +inline auto param_generator_real(const std::vector>& v_lengths, + const std::vector& precision_range, + const std::vector& batch_range, + const stride_generator& istride, + const stride_generator& ostride, + const std::vector>& ioffset_range, + const std::vector>& ooffset_range, + const std::vector& place_range, + const bool planar, + const bool run_callbacks = false) +{ + return param_generator_base(trans_type_range_real, + v_lengths, + precision_range, + batch_range, + generate_types, + istride, + ostride, + ioffset_range, + ooffset_range, + place_range, + planar, + run_callbacks); +} + +template +auto param_generator_token(const Tcontainer& tokens) +{ + std::vector params; + params.reserve(tokens.size()); + for(auto t : tokens) + { + params.push_back({}); + params.back().from_token(t); + } + return params; +} + +struct callback_test_data +{ + // scalar to modify the input/output with + double scalar; + // base address of input, to ensure that each callback gets an offset from that base + void* base; +}; + +void* get_load_callback_host(fft_array_type itype, + fft_precision precision, + bool round_trip_inverse); +void apply_load_callback(const fft_params& params, std::vector& input); +void apply_store_callback(const fft_params& params, std::vector& output); +void* get_store_callback_host(fft_array_type otype, + fft_precision precision, + bool round_trip_inverse); + +static auto allocate_cpu_fft_buffer(const fft_precision precision, + const fft_array_type type, + const std::vector& size) +{ + // FFTW does not support half-precision, so we do single instead. + // So if we need to do a half-precision FFTW transform, allocate + // enough buffer for single-precision instead. + return allocate_host_buffer( + precision == fft_precision_half ? 
fft_precision_single : precision, type, size); +} + +template +inline void execute_cpu_fft(fft_params& params, + fft_params& contiguous_params, + typename fftw_trait::fftw_plan_type& cpu_plan, + std::vector& cpu_input, + std::vector& cpu_output) +{ + // CPU output might not be allocated already for us, if FFTW never + // needed an output buffer during planning + if(cpu_output.empty()) + cpu_output = allocate_cpu_fft_buffer( + contiguous_params.precision, contiguous_params.otype, contiguous_params.osize); + + // If this is either C2R or callbacks are enabled, the + // input will be modified. So we need to modify the copy instead. + std::vector cpu_input_copy(cpu_input.size()); + std::vector* input_ptr = &cpu_input; + if(params.run_callbacks || contiguous_params.transform_type == fft_transform_type_real_inverse) + { + for(size_t i = 0; i < cpu_input.size(); ++i) + { + cpu_input_copy[i] = cpu_input[i].copy(); + } + + input_ptr = &cpu_input_copy; + } + + // run FFTW (which may destroy CPU input) + apply_load_callback(params, *input_ptr); + fftw_run(contiguous_params.transform_type, cpu_plan, *input_ptr, cpu_output); + // clean up + fftw_destroy_plan_type(cpu_plan); + // ask FFTW to fully clean up, since it tries to cache plan details + fftw_cleanup(); + cpu_plan = nullptr; + apply_store_callback(params, cpu_output); +} + +// execute the GPU transform +template +inline void execute_gpu_fft(Tparams& params, + std::vector& pibuffer, + std::vector& pobuffer, + std::vector& obuffer, + std::vector& gpu_output, + bool round_trip_inverse = false) +{ + gpubuf_t load_cb_data_dev; + gpubuf_t store_cb_data_dev; + if(params.run_callbacks) + { + void* load_cb_host + = get_load_callback_host(params.itype, params.precision, round_trip_inverse); + + callback_test_data load_cb_data_host; + + if(round_trip_inverse) + { + load_cb_data_host.scalar = params.store_cb_scalar; + } + else + { + load_cb_data_host.scalar = params.load_cb_scalar; + } + + load_cb_data_host.base = pibuffer.front(); + + auto hip_status = hipSuccess; + + hip_status = load_cb_data_dev.alloc(sizeof(callback_test_data)); + if(hip_status != hipSuccess) + { + ++n_hip_failures; + if(skip_runtime_fails) + { + GTEST_SKIP(); + } + else + { + GTEST_FAIL(); + } + } + hip_status = hipMemcpy(load_cb_data_dev.data(), + &load_cb_data_host, + sizeof(callback_test_data), + hipMemcpyHostToDevice); + if(hip_status != hipSuccess) + { + ++n_hip_failures; + if(skip_runtime_fails) + { + GTEST_SKIP(); + } + else + { + GTEST_FAIL(); + } + } + + void* store_cb_host + = get_store_callback_host(params.otype, params.precision, round_trip_inverse); + + callback_test_data store_cb_data_host; + + if(round_trip_inverse) + { + store_cb_data_host.scalar = params.load_cb_scalar; + } + else + { + store_cb_data_host.scalar = params.store_cb_scalar; + } + + store_cb_data_host.base = pobuffer.front(); + + hip_status = store_cb_data_dev.alloc(sizeof(callback_test_data)); + if(hip_status != hipSuccess) + { + ++n_hip_failures; + if(skip_runtime_fails) + { + GTEST_SKIP(); + } + else + { + GTEST_FAIL(); + } + } + + hip_status = hipMemcpy(store_cb_data_dev.data(), + &store_cb_data_host, + sizeof(callback_test_data), + hipMemcpyHostToDevice); + if(hip_status != hipSuccess) + { + ++n_hip_failures; + if(skip_runtime_fails) + { + GTEST_SKIP(); + } + else + { + GTEST_FAIL(); + } + } + + auto fft_status = params.set_callbacks( + load_cb_host, load_cb_data_dev.data(), store_cb_host, store_cb_data_dev.data()); + if(fft_status != fft_status_success) + throw std::runtime_error("set callback 
failure"); + } + + // Execute the transform: + auto fft_status = params.execute(pibuffer.data(), pobuffer.data()); + if(fft_status != fft_status_success) + throw std::runtime_error("rocFFT plan execution failure"); + + // if not comparing, then just executing the GPU FFT is all we + // need to do + if(!fftw_compare) + return; + + // finalize a multi-GPU transform + params.multi_gpu_finalize(obuffer, pobuffer); + + ASSERT_TRUE(!gpu_output.empty()) << "no output buffers"; + for(unsigned int idx = 0; idx < gpu_output.size(); ++idx) + { + ASSERT_TRUE(gpu_output[idx].data() != nullptr) + << "output buffer index " << idx << " is empty"; + auto hip_status = hipMemcpy(gpu_output[idx].data(), + pobuffer.at(idx), + gpu_output[idx].size(), + hipMemcpyDeviceToHost); + if(hip_status != hipSuccess) + { + ++n_hip_failures; + if(skip_runtime_fails) + { + GTEST_SKIP() << "hipMemcpy failure"; + } + else + { + GTEST_FAIL() << "hipMemcpy failure"; + } + } + } + if(verbose > 2) + { + std::cout << "GPU output:\n"; + params.print_obuffer(gpu_output); + } + if(verbose > 5) + { + std::cout << "flat GPU output:\n"; + params.print_obuffer_flat(gpu_output); + } +} + +template +static void assert_init_value(const std::vector& output, + const size_t idx, + const Tfloat orig_value); + +template <> +void assert_init_value(const std::vector& output, const size_t idx, const float orig_value) +{ + float actual_value = reinterpret_cast(output.front().data())[idx]; + ASSERT_EQ(actual_value, orig_value) << "index " << idx; +} + +template <> +void assert_init_value(const std::vector& output, + const size_t idx, + const double orig_value) +{ + double actual_value = reinterpret_cast(output.front().data())[idx]; + ASSERT_EQ(actual_value, orig_value) << "index " << idx; +} + +template <> +void assert_init_value(const std::vector& output, + const size_t idx, + const rocfft_complex orig_value) +{ + // if this is interleaved, check directly + if(output.size() == 1) + { + rocfft_complex actual_value + = reinterpret_cast*>(output.front().data())[idx]; + ASSERT_EQ(actual_value.x, orig_value.x) << "x index " << idx; + ASSERT_EQ(actual_value.y, orig_value.y) << "y index " << idx; + } + else + { + // planar + rocfft_complex actual_value{ + reinterpret_cast(output.front().data())[idx], + reinterpret_cast(output.back().data())[idx]}; + ASSERT_EQ(actual_value.x, orig_value.x) << "x index " << idx; + ASSERT_EQ(actual_value.y, orig_value.y) << "y index " << idx; + } +} + +template <> +void assert_init_value(const std::vector& output, + const size_t idx, + const rocfft_complex orig_value) +{ + // if this is interleaved, check directly + if(output.size() == 1) + { + rocfft_complex actual_value + = reinterpret_cast*>(output.front().data())[idx]; + ASSERT_EQ(actual_value.x, orig_value.x) << "x index " << idx; + ASSERT_EQ(actual_value.y, orig_value.y) << "y index " << idx; + } + else + { + // planar + rocfft_complex actual_value{ + reinterpret_cast(output.front().data())[idx], + reinterpret_cast(output.back().data())[idx]}; + ASSERT_EQ(actual_value.x, orig_value.x) << "x index " << idx; + ASSERT_EQ(actual_value.y, orig_value.y) << "y index " << idx; + } +} + +static const int OUTPUT_INIT_PATTERN = 0xcd; +template +void check_single_output_stride(const std::vector& output, + const size_t offset, + const std::vector& length, + const std::vector& stride, + const size_t i) +{ + Tfloat orig; + memset(static_cast(&orig), OUTPUT_INIT_PATTERN, sizeof(Tfloat)); + + size_t curLength = length[i]; + size_t curStride = stride[i]; + size_t nextSmallerLength = i == 
length.size() - 1 ? 0 : length[i + 1]; + size_t nextSmallerStride = i == stride.size() - 1 ? 0 : stride[i + 1]; + + if(nextSmallerLength == 0) + { + // this is the fastest dim, indexes that are not multiples of + // the stride should be the initial value + for(size_t idx = 0; idx < (curLength - 1) * curStride; ++idx) + { + if(idx % curStride != 0) + assert_init_value(output, idx, orig); + } + } + else + { + for(size_t lengthIdx = 0; lengthIdx < curLength; ++lengthIdx) + { + // check that the space after the next smaller dim and the + // end of this dim is initial value + for(size_t idx = nextSmallerLength * nextSmallerStride; idx < curStride; ++idx) + assert_init_value(output, idx, orig); + + check_single_output_stride( + output, offset + lengthIdx * curStride, length, stride, i + 1); + } + } +} + +template +void check_output_strides(const std::vector& output, Tparams& params) +{ + // treat batch+dist like highest length+stride, if batch > 1 + std::vector length; + std::vector stride; + if(params.nbatch > 1) + { + length.push_back(params.nbatch); + stride.push_back(params.odist); + } + + auto olength = params.olength(); + std::copy(olength.begin(), olength.end(), std::back_inserter(length)); + std::copy(params.ostride.begin(), params.ostride.end(), std::back_inserter(stride)); + + if(params.precision == fft_precision_single) + { + if(params.otype == fft_array_type_real) + check_single_output_stride(output, 0, length, stride, 0); + else + check_single_output_stride>(output, 0, length, stride, 0); + } + else + { + if(params.otype == fft_array_type_real) + check_single_output_stride(output, 0, length, stride, 0); + else + check_single_output_stride>(output, 0, length, stride, 0); + } +} + +// run rocFFT inverse transform +template +inline void run_round_trip_inverse(Tparams& params, + std::vector& obuffer, + std::vector& pibuffer, + std::vector& pobuffer, + std::vector& gpu_output) +{ + params.validate(); + + // Make sure that the parameters make sense: + ASSERT_TRUE(params.valid(verbose)); + + // Create FFT plan - this will also allocate work buffer, but will throw a + // specific exception if that step fails + auto plan_status = fft_status_success; + try + { + plan_status = params.create_plan(); + } + catch(fft_params::work_buffer_alloc_failure& e) + { + std::stringstream ss; + ss << "Failed to allocate work buffer (size: " << params.workbuffersize << ")"; + ++n_hip_failures; + if(skip_runtime_fails) + { + GTEST_SKIP() << ss.str(); + } + else + { + GTEST_FAIL() << ss.str(); + } + } + ASSERT_EQ(plan_status, fft_status_success) << "round trip inverse plan creation failed"; + + auto obuffer_sizes = params.obuffer_sizes(); + + if(params.placement != fft_placement_inplace) + { + for(unsigned int i = 0; i < obuffer_sizes.size(); ++i) + { + // If we're validating output strides, init the + // output buffer to a known pattern and we can check + // that the pattern is untouched in places that + // shouldn't have been touched. 
+ if(params.check_output_strides) + { + auto hip_status + = hipMemset(obuffer[i].data(), OUTPUT_INIT_PATTERN, obuffer_sizes[i]); + if(hip_status != hipSuccess) + { + ++n_hip_failures; + if(skip_runtime_fails) + { + GTEST_SKIP() << "hipMemset failure"; + } + else + { + GTEST_FAIL() << "hipMemset failure"; + } + } + } + } + } + + // execute GPU transform + execute_gpu_fft(params, pibuffer, pobuffer, obuffer, gpu_output, true); +} + +// compare rocFFT inverse transform with forward transform input +template +inline void compare_round_trip_inverse(Tparams& params, + fft_params& contiguous_params, + std::vector& gpu_output, + std::vector& cpu_input, + const VectorNorms& cpu_input_norm, + size_t total_length) +{ + if(params.check_output_strides) + { + check_output_strides(gpu_output, params); + } + + // compute GPU output norm + std::shared_future gpu_norm = std::async(std::launch::async, [&]() { + return norm(gpu_output, + params.olength(), + params.nbatch, + params.precision, + params.otype, + params.ostride, + params.odist, + params.ooffset); + }); + + // compare GPU inverse output to CPU forward input + std::unique_ptr>> linf_failures; + if(verbose > 1) + linf_failures = std::make_unique>>(); + const double linf_cutoff + = type_epsilon(params.precision) * cpu_input_norm.l_inf * log(total_length); + + VectorNorms diff = distance(cpu_input, + gpu_output, + params.olength(), + params.nbatch, + params.precision, + contiguous_params.itype, + contiguous_params.istride, + contiguous_params.idist, + params.otype, + params.ostride, + params.odist, + linf_failures.get(), + linf_cutoff, + {0}, + params.ooffset, + 1.0 / total_length); + + if(verbose > 1) + { + std::cout << "GPU output Linf norm: " << gpu_norm.get().l_inf << "\n"; + std::cout << "GPU output L2 norm: " << gpu_norm.get().l_2 << "\n"; + std::cout << "GPU linf norm failures:"; + std::sort(linf_failures->begin(), linf_failures->end()); + for(const auto& i : *linf_failures) + { + std::cout << " (" << i.first << "," << i.second << ")"; + } + std::cout << std::endl; + } + + EXPECT_TRUE(std::isfinite(gpu_norm.get().l_inf)) << params.str(); + EXPECT_TRUE(std::isfinite(gpu_norm.get().l_2)) << params.str(); + + switch(params.precision) + { + case fft_precision_half: + max_linf_eps_half + = std::max(max_linf_eps_half, diff.l_inf / cpu_input_norm.l_inf / log(total_length)); + max_l2_eps_half + = std::max(max_l2_eps_half, diff.l_2 / cpu_input_norm.l_2 * sqrt(log2(total_length))); + break; + case fft_precision_single: + max_linf_eps_single + = std::max(max_linf_eps_single, diff.l_inf / cpu_input_norm.l_inf / log(total_length)); + max_l2_eps_single + = std::max(max_l2_eps_single, diff.l_2 / cpu_input_norm.l_2 * sqrt(log2(total_length))); + break; + case fft_precision_double: + max_linf_eps_double + = std::max(max_linf_eps_double, diff.l_inf / cpu_input_norm.l_inf / log(total_length)); + max_l2_eps_double + = std::max(max_l2_eps_double, diff.l_2 / cpu_input_norm.l_2 * sqrt(log2(total_length))); + break; + } + + if(verbose > 1) + { + std::cout << "L2 diff: " << diff.l_2 << "\n"; + std::cout << "Linf diff: " << diff.l_inf << "\n"; + } + + EXPECT_TRUE(diff.l_inf <= linf_cutoff) + << "Linf test failed. Linf:" << diff.l_inf + << "\tnormalized Linf: " << diff.l_inf / cpu_input_norm.l_inf << "\tcutoff: " << linf_cutoff + << params.str(); + + EXPECT_TRUE(diff.l_2 / cpu_input_norm.l_2 + < sqrt(log2(total_length)) * type_epsilon(params.precision)) + << "L2 test failed. 
L2: " << diff.l_2 + << "\tnormalized L2: " << diff.l_2 / cpu_input_norm.l_2 + << "\tepsilon: " << sqrt(log2(total_length)) * type_epsilon(params.precision) + << params.str(); +} + +// RAII type to put data into the cache when this object leaves scope +struct StoreCPUDataToCache +{ + StoreCPUDataToCache(std::vector& cpu_input, std::vector& cpu_output) + : cpu_input(cpu_input) + , cpu_output(cpu_output) + { + } + ~StoreCPUDataToCache() + { + last_cpu_fft_data.cpu_output.swap(cpu_output); + last_cpu_fft_data.cpu_input.swap(cpu_input); + } + std::vector& cpu_input; + std::vector& cpu_output; +}; + +// run CPU + rocFFT transform with the given params and compare +template +inline void fft_vs_reference_impl(Tparams& params, bool round_trip) +{ + // Call hipGetLastError to reset any errors + // returned by previous HIP runtime API calls. + hipError_t hip_status = hipGetLastError(); + + // Make sure that the parameters make sense: + ASSERT_TRUE(params.valid(verbose)); + + size_t needed_ram = needed_ram_buffers(params, verbose); + + if(ramgb > 0 && needed_ram > ramgb * ONE_GiB) + { + GTEST_SKIP() << "needed_ramgb: " << bytes_to_GiB(needed_ram) << ", ramgb limit: " << ramgb + << ".\n"; + } + + auto ibuffer_sizes = params.ibuffer_sizes(); + auto obuffer_sizes = params.obuffer_sizes(); + + size_t vram_avail = 0; + + if(vramgb == 0) + { + // Check free and total available memory: + size_t free = 0; + size_t total = 0; + auto hip_status = hipMemGetInfo(&free, &total); + if(hip_status != hipSuccess || total == 0) + { + ++n_hip_failures; + std::stringstream ss; + if(total == 0) + ss << "hipMemGetInfo claims there there isn't any vram"; + else + ss << "hipMemGetInfo failure with error " << hip_status; + if(skip_runtime_fails) + { + GTEST_SKIP() << ss.str(); + } + else + { + GTEST_FAIL() << ss.str(); + } + } + vram_avail = total; + } + else + { + vram_avail = vramgb * ONE_GiB; + } + + // First try a quick estimation of vram footprint, to speed up skipping tests + // that are too large to fit in the gpu (no plan created with the rocFFT backend) + const auto raw_vram_footprint + = params.fft_params_vram_footprint() + twiddle_table_vram_footprint(params); + + if(!vram_fits_problem(raw_vram_footprint, vram_avail)) + { + GTEST_SKIP() << "Raw problem size (" << bytes_to_GiB(raw_vram_footprint) + << " GiB) raw data too large for device"; + } + + if(verbose > 2) + { + std::cout << "Raw problem size: " << raw_vram_footprint << std::endl; + } + + // If it passed the quick estimation test, go for the more + // accurate calculation that actually creates the plan and + // take into account the work buffer size + const auto vram_footprint = params.vram_footprint(); + if(!vram_fits_problem(vram_footprint, vram_avail)) + { + if(verbose) + { + std::cout << "Problem raw data won't fit on device; skipped." 
<< std::endl; + } + GTEST_SKIP() << "Problem size (" << bytes_to_GiB(vram_footprint) + << " GiB) raw data too large for device"; + } + + // Create FFT plan - this will also allocate work buffer, but + // will throw a specific exception if that step fails + auto plan_status = fft_status_success; + try + { + plan_status = params.create_plan(); + } + catch(fft_params::work_buffer_alloc_failure& e) + { + ++n_hip_failures; + std::stringstream ss; + ss << "Work buffer allocation failed with size: " << params.workbuffersize; + if(skip_runtime_fails) + { + GTEST_SKIP() << ss.str(); + } + else + { + GTEST_FAIL() << ss.str(); + } + } + ASSERT_EQ(plan_status, fft_status_success) << "plan creation failed"; + + if(!vram_fits_problem(vram_footprint, vram_avail)) + { + if(verbose) + { + std::cout << "Problem won't fit on device; skipped." << std::endl; + } + GTEST_SKIP() << "Problem size (" << vram_footprint << ") too large for device"; + return; + } + + fft_params contiguous_params; + contiguous_params.length = params.length; + contiguous_params.precision = params.precision; + contiguous_params.placement = fft_placement_notinplace; + contiguous_params.transform_type = params.transform_type; + contiguous_params.nbatch = params.nbatch; + contiguous_params.itype = contiguous_itype(params.transform_type); + contiguous_params.otype = contiguous_otype(contiguous_params.transform_type); + + contiguous_params.validate(); + + if(!contiguous_params.valid(verbose)) + { + throw std::runtime_error("Invalid contiguous params"); + } + + if(verbose > 3) + { + std::cout << "CPU params:\n"; + std::cout << contiguous_params.str("\n\t") << std::endl; + } + + std::vector ibuffer(ibuffer_sizes.size()); + std::vector pibuffer(ibuffer_sizes.size()); + for(unsigned int i = 0; i < ibuffer.size(); ++i) + { + hip_status = ibuffer[i].alloc(ibuffer_sizes[i]); + if(hip_status != hipSuccess) + { + std::stringstream ss; + ss << "hipMalloc failure for input buffer " << i << " size " << ibuffer_sizes[i] << "(" + << bytes_to_GiB(ibuffer_sizes[i]) << " GiB)" + << " with code " << hipError_to_string(hip_status); + ++n_hip_failures; + if(skip_runtime_fails) + { + GTEST_SKIP() << ss.str(); + } + else + { + GTEST_FAIL() << ss.str(); + } + } + pibuffer[i] = ibuffer[i].data(); + } + + // allocation counts in elements, ibuffer_sizes is in bytes + auto ibuffer_sizes_elems = ibuffer_sizes; + for(auto& buf : ibuffer_sizes_elems) + buf /= var_size(params.precision, params.itype); + + // Check cache first - nbatch is a >= comparison because we compute + // the largest batch size and cache it. Smaller batch runs can + // compare against the larger data. + std::vector cpu_input; + std::vector cpu_output; + std::shared_future convert_cpu_output_precision; + std::shared_future convert_cpu_input_precision; + bool run_fftw = true; + std::unique_ptr store_to_cache; + if(fftw_compare && last_cpu_fft_data.length == params.length + && last_cpu_fft_data.transform_type == params.transform_type + && last_cpu_fft_data.run_callbacks == params.run_callbacks) + { + if(last_cpu_fft_data.nbatch >= params.nbatch) + { + // use the cached input/output + cpu_input.swap(last_cpu_fft_data.cpu_input); + cpu_output.swap(last_cpu_fft_data.cpu_output); + run_fftw = false; + + store_to_cache = std::make_unique(cpu_input, cpu_output); + + if(params.precision != last_cpu_fft_data.precision) + { + // Tests should be ordered so we do wider first, then narrower. 
+ switch(params.precision) + { + case fft_precision_double: + std::cerr + << "test ordering is incorrect: double precision follows a narrower one" + << std::endl; + abort(); + break; + case fft_precision_single: + if(last_cpu_fft_data.precision != fft_precision_double) + { + std::cerr + << "test ordering is incorrect: float precision follows a narrower one" + << std::endl; + abort(); + } + // convert the input/output to single-precision + convert_cpu_output_precision = std::async(std::launch::async, [&]() { + narrow_precision_inplace(cpu_output.front()); + }); + convert_cpu_input_precision = std::async(std::launch::async, [&]() { + narrow_precision_inplace(cpu_input.front()); + }); + break; + case fft_precision_half: + // convert to half precision + if(last_cpu_fft_data.precision == fft_precision_double) + { + convert_cpu_output_precision = std::async(std::launch::async, [&]() { + narrow_precision_inplace(cpu_output.front()); + }); + convert_cpu_input_precision = std::async(std::launch::async, [&]() { + narrow_precision_inplace(cpu_input.front()); + }); + } + else if(last_cpu_fft_data.precision == fft_precision_single) + { + convert_cpu_output_precision = std::async(std::launch::async, [&]() { + narrow_precision_inplace(cpu_output.front()); + }); + convert_cpu_input_precision = std::async(std::launch::async, [&]() { + narrow_precision_inplace(cpu_input.front()); + }); + } + else + { + std::cerr << "unhandled previous precision, cannot convert to half" + << std::endl; + abort(); + } + break; + } + last_cpu_fft_data.precision = params.precision; + } + } + // If the last result has a smaller batch than the new + // params, that might be a developer error - tests should be + // ordered to generate the bigger batch first. But if tests + // got filtered or skipped due to insufficient memory, we + // might never have tried to generate the bigger batch first. + // So just fall through and redo the CPU FFT. + } + else + { + // Clear cache explicitly so that even if we didn't get a hit, + // we're not uselessly holding on to cached cpu input/output + last_cpu_fft_data = last_cpu_fft_cache(); + } + + // Allocate CPU input + if(run_fftw) + { + cpu_input = allocate_cpu_fft_buffer( + contiguous_params.precision, contiguous_params.itype, contiguous_params.isize); + } + + // Create FFTW plan - this may write to input, but that's fine + // since there's nothing in there right now + typename fftw_trait::fftw_plan_type cpu_plan = nullptr; + if(run_fftw) + { + // Normally, we would want to defer allocation of CPU output + // buffer until when we actually do the CPU FFT. But if we're + // using FFTW wisdom, FFTW needs an output buffer at plan + // creation time. + if(use_fftw_wisdom) + { + cpu_output = allocate_cpu_fft_buffer( + contiguous_params.precision, contiguous_params.otype, contiguous_params.osize); + } + cpu_plan = fftw_plan_via_rocfft(contiguous_params.length, + contiguous_params.istride, + contiguous_params.ostride, + contiguous_params.nbatch, + contiguous_params.idist, + contiguous_params.odist, + contiguous_params.transform_type, + cpu_input, + cpu_output); + + needed_ram += needed_ram_fftw(contiguous_params, cpu_plan, verbose); + + if(ramgb > 0 && needed_ram > ramgb * ONE_GiB) + { + if(verbose) + { + std::cout << "Problem exceeds memory limit; skipped [rocfft_transform]." 
+ << std::endl; + } + GTEST_SKIP(); + return; + } + } + + std::vector gpu_input_data; + + // allocate and populate the input buffer (cpu/gpu) + if(run_fftw) + { + gpu_input_data = allocate_host_buffer(params.precision, params.itype, ibuffer_sizes_elems); + + //generate the input directly on the gpu + params.compute_input(ibuffer); + + // Copy the input to CPU + if(params.itype != contiguous_params.itype || params.istride != contiguous_params.istride + || params.idist != contiguous_params.idist || params.isize != contiguous_params.isize) + { + // Copy input to CPU + for(unsigned int idx = 0; idx < ibuffer.size(); ++idx) + { + hip_status = hipMemcpy(gpu_input_data.at(idx).data(), + ibuffer[idx].data(), + ibuffer_sizes[idx], + hipMemcpyDeviceToHost); + if(hip_status != hipSuccess) + { + ++n_hip_failures; + if(skip_runtime_fails) + { + GTEST_SKIP() << "hipMemcpy failure with error " << hip_status; + } + else + { + GTEST_FAIL() << "hipMemcpy failure with error " << hip_status; + } + } + } + + copy_buffers(gpu_input_data, + cpu_input, + params.ilength(), + params.nbatch, + params.precision, + params.itype, + params.istride, + params.idist, + contiguous_params.itype, + contiguous_params.istride, + contiguous_params.idist, + params.ioffset, + contiguous_params.ioffset); + } + else + { + // Copy input to CPU + for(unsigned int idx = 0; idx < ibuffer.size(); ++idx) + { + hip_status = hipMemcpy(cpu_input.at(idx).data(), + ibuffer[idx].data(), + ibuffer_sizes[idx], + hipMemcpyDeviceToHost); + if(hip_status != hipSuccess) + { + ++n_hip_failures; + if(skip_runtime_fails) + { + GTEST_SKIP() << "hipMemcpy failure with error " << hip_status; + } + else + { + GTEST_FAIL() << "hipMemcpy failure with error " << hip_status; + } + } + } + } + } + else if(fftw_compare) + { + gpu_input_data = allocate_host_buffer(params.precision, params.itype, ibuffer_sizes_elems); + + // In case the cached cpu input needed conversion, wait for it + if(convert_cpu_input_precision.valid()) + convert_cpu_input_precision.get(); + + // gets a pre-computed gpu input buffer from the cpu cache + std::vector* gpu_input = &cpu_input; + + if(params.itype != contiguous_params.itype || params.istride != contiguous_params.istride + || params.idist != contiguous_params.idist || params.isize != contiguous_params.isize) + { + copy_buffers(cpu_input, + gpu_input_data, + params.ilength(), + params.nbatch, + params.precision, + contiguous_params.itype, + contiguous_params.istride, + contiguous_params.idist, + params.itype, + params.istride, + params.idist, + {0}, + params.ioffset); + gpu_input = &gpu_input_data; + } + + // Copy input to GPU + for(unsigned int idx = 0; idx < gpu_input->size(); ++idx) + { + hip_status = hipMemcpy(ibuffer[idx].data(), + gpu_input->at(idx).data(), + ibuffer_sizes[idx], + hipMemcpyHostToDevice); + + if(hip_status != hipSuccess) + { + ++n_hip_failures; + if(skip_runtime_fails) + { + GTEST_SKIP() << "hipMemcpy failure with error " << hip_status; + } + else + { + GTEST_FAIL() << "hipMemcpy failure with error " << hip_status; + } + } + } + } + + if(verbose > 3) + { + std::cout << "CPU input:\n"; + contiguous_params.print_ibuffer(cpu_input); + } + + // compute input norm + std::shared_future cpu_input_norm; + if(fftw_compare) + cpu_input_norm = std::async(std::launch::async, [&]() { + // in case the cached cpu input needed conversion, wait for it + if(convert_cpu_input_precision.valid()) + convert_cpu_input_precision.get(); + + auto input_norm = norm(cpu_input, + contiguous_params.ilength(), + contiguous_params.nbatch, + 
contiguous_params.precision, + contiguous_params.itype, + contiguous_params.istride, + contiguous_params.idist, + contiguous_params.ioffset); + if(verbose > 2) + { + std::cout << "CPU Input Linf norm: " << input_norm.l_inf << "\n"; + std::cout << "CPU Input L2 norm: " << input_norm.l_2 << "\n"; + } + return input_norm; + }); + + std::vector obuffer_data; + std::vector* obuffer = &obuffer_data; + std::vector pobuffer; + + // allocate the output buffer + + if(params.placement == fft_placement_inplace) + { + obuffer = &ibuffer; + } + else + { + auto obuffer_sizes = params.obuffer_sizes(); + obuffer_data.resize(obuffer_sizes.size()); + for(unsigned int i = 0; i < obuffer_data.size(); ++i) + { + hip_status = obuffer_data[i].alloc(obuffer_sizes[i]); + if(hip_status != hipSuccess) + { + ++n_hip_failures; + std::stringstream ss; + ss << "hipMalloc failure for output buffer " << i << " size " << obuffer_sizes[i] + << "(" << bytes_to_GiB(obuffer_sizes[i]) << " GiB)" + << " with code " << hipError_to_string(hip_status); + if(skip_runtime_fails) + { + GTEST_SKIP() << ss.str(); + } + else + { + GTEST_FAIL() << ss.str(); + } + } + + // If we're validating output strides, init the + // output buffer to a known pattern and we can check + // that the pattern is untouched in places that + // shouldn't have been touched. + if(params.check_output_strides) + { + hip_status + = hipMemset(obuffer_data[i].data(), OUTPUT_INIT_PATTERN, obuffer_sizes[i]); + if(hip_status != hipSuccess) + { + ++n_hip_failures; + if(skip_runtime_fails) + { + GTEST_SKIP() << "hipMemset failure with error " << hip_status; + } + else + { + GTEST_FAIL() << "hipMemset failure with error " << hip_status; + } + } + } + } + } + pobuffer.resize(obuffer->size()); + for(unsigned int i = 0; i < obuffer->size(); ++i) + { + pobuffer[i] = obuffer->at(i).data(); + } + + // Run CPU transform + // + // NOTE: This must happen after input is copied to GPU and input + // norm is computed, since the CPU FFT may overwrite the input. 
+ VectorNorms cpu_output_norm; + std::shared_future cpu_fft; + if(fftw_compare) + cpu_fft = std::async(std::launch::async, [&]() { + // wait for input norm to finish, since we might overwrite input + cpu_input_norm.get(); + + if(run_fftw) + execute_cpu_fft(params, contiguous_params, cpu_plan, cpu_input, cpu_output); + // in case the cached cpu output needed conversion, wait for it + else if(convert_cpu_output_precision.valid()) + convert_cpu_output_precision.get(); + + if(verbose > 3) + { + std::cout << "CPU output:\n"; + contiguous_params.print_obuffer(cpu_output); + } + + cpu_output_norm = norm(cpu_output, + params.olength(), + params.nbatch, + params.precision, + contiguous_params.otype, + contiguous_params.ostride, + contiguous_params.odist, + contiguous_params.ooffset); + if(verbose > 2) + { + std::cout << "CPU Output Linf norm: " << cpu_output_norm.l_inf << "\n"; + std::cout << "CPU Output L2 norm: " << cpu_output_norm.l_2 << "\n"; + } + }); + + // scatter data out to multi-GPUs if this is a multi-GPU test + params.multi_gpu_prepare(ibuffer, pibuffer, pobuffer); + + // execute GPU transform + std::vector gpu_output + = allocate_host_buffer(params.precision, params.otype, params.osize); + + execute_gpu_fft(params, pibuffer, pobuffer, *obuffer, gpu_output); + + params.free(); + + if(params.check_output_strides) + { + check_output_strides(gpu_output, params); + } + + // compute GPU output norm + std::shared_future gpu_norm; + if(fftw_compare) + gpu_norm = std::async(std::launch::async, [&]() { + return norm(gpu_output, + params.olength(), + params.nbatch, + params.precision, + params.otype, + params.ostride, + params.odist, + params.ooffset); + }); + + // compare output + // + // Compute the l-infinity and l-2 distance between the CPU and GPU output: + // wait for cpu FFT so we can compute cutoff + + const auto total_length = std::accumulate(params.length.begin(), + params.length.end(), + static_cast(1), + std::multiplies()); + + std::unique_ptr>> linf_failures; + if(verbose > 1) + linf_failures = std::make_unique>>(); + double linf_cutoff; + VectorNorms diff; + + std::shared_future compare_output; + if(fftw_compare) + compare_output = std::async(std::launch::async, [&]() { + cpu_fft.get(); + linf_cutoff + = type_epsilon(params.precision) * cpu_output_norm.l_inf * log(total_length); + + diff = distance(cpu_output, + gpu_output, + params.olength(), + params.nbatch, + params.precision, + contiguous_params.otype, + contiguous_params.ostride, + contiguous_params.odist, + params.otype, + params.ostride, + params.odist, + linf_failures.get(), + linf_cutoff, + {0}, + params.ooffset); + }); + + // Update the cache if this current transform is different from + // what's stored. But if this transform only has a smaller batch + // than what's cached, we can still keep the cache around since + // the input/output we already have is still valid. 
+ const bool update_last_cpu_fft_data + = last_cpu_fft_data.length != params.length + || last_cpu_fft_data.transform_type != params.transform_type + || last_cpu_fft_data.run_callbacks != params.run_callbacks + || last_cpu_fft_data.precision != params.precision + || params.nbatch > last_cpu_fft_data.nbatch; + + // store cpu output in cache + if(update_last_cpu_fft_data) + { + last_cpu_fft_data.length = params.length; + last_cpu_fft_data.nbatch = params.nbatch; + last_cpu_fft_data.transform_type = params.transform_type; + last_cpu_fft_data.run_callbacks = params.run_callbacks; + last_cpu_fft_data.precision = params.precision; + } + + if(compare_output.valid()) + compare_output.get(); + + if(!store_to_cache) + store_to_cache = std::make_unique(cpu_input, cpu_output); + + Tparams params_inverse; + + if(round_trip) + { + params_inverse.inverse_from_forward(params); + + run_round_trip_inverse( + params_inverse, ibuffer, pobuffer, pibuffer, gpu_input_data); + } + + if(fftw_compare) + { + ASSERT_TRUE(std::isfinite(cpu_input_norm.get().l_2)); + ASSERT_TRUE(std::isfinite(cpu_input_norm.get().l_inf)); + + ASSERT_TRUE(std::isfinite(cpu_output_norm.l_2)); + ASSERT_TRUE(std::isfinite(cpu_output_norm.l_inf)); + + if(verbose > 1) + { + std::cout << "GPU output Linf norm: " << gpu_norm.get().l_inf << "\n"; + std::cout << "GPU output L2 norm: " << gpu_norm.get().l_2 << "\n"; + std::cout << "GPU linf norm failures:"; + std::sort(linf_failures->begin(), linf_failures->end()); + for(const auto& i : *linf_failures) + { + std::cout << " (" << i.first << "," << i.second << ")"; + } + std::cout << std::endl; + } + + EXPECT_TRUE(std::isfinite(gpu_norm.get().l_inf)) << params.str(); + EXPECT_TRUE(std::isfinite(gpu_norm.get().l_2)) << params.str(); + } + + switch(params.precision) + { + case fft_precision_half: + max_linf_eps_half + = std::max(max_linf_eps_half, diff.l_inf / cpu_output_norm.l_inf / log(total_length)); + max_l2_eps_half + = std::max(max_l2_eps_half, diff.l_2 / cpu_output_norm.l_2 * sqrt(log2(total_length))); + break; + case fft_precision_single: + max_linf_eps_single + = std::max(max_linf_eps_single, diff.l_inf / cpu_output_norm.l_inf / log(total_length)); + max_l2_eps_single = std::max(max_l2_eps_single, + diff.l_2 / cpu_output_norm.l_2 * sqrt(log2(total_length))); + break; + case fft_precision_double: + max_linf_eps_double + = std::max(max_linf_eps_double, diff.l_inf / cpu_output_norm.l_inf / log(total_length)); + max_l2_eps_double = std::max(max_l2_eps_double, + diff.l_2 / cpu_output_norm.l_2 * sqrt(log2(total_length))); + break; + } + + if(verbose > 1) + { + std::cout << "L2 diff: " << diff.l_2 << "\n"; + std::cout << "Linf diff: " << diff.l_inf << "\n"; + } + + if(fftw_compare) + { + EXPECT_TRUE(diff.l_inf <= linf_cutoff) + << "Linf test failed. Linf:" << diff.l_inf + << "\tnormalized Linf: " << diff.l_inf / cpu_output_norm.l_inf + << "\tcutoff: " << linf_cutoff << params.str(); + + EXPECT_TRUE(diff.l_2 / cpu_output_norm.l_2 + < sqrt(log2(total_length)) * type_epsilon(params.precision)) + << "L2 test failed. 
L2: " << diff.l_2 + << "\tnormalized L2: " << diff.l_2 / cpu_output_norm.l_2 + << "\tepsilon: " << sqrt(log2(total_length)) * type_epsilon(params.precision) + << params.str(); + } + + if(round_trip && fftw_compare) + { + compare_round_trip_inverse(params_inverse, + contiguous_params, + gpu_input_data, + cpu_input, + cpu_input_norm.get(), + total_length); + } +} + +#endif diff --git a/shared/arithmetic.h b/shared/arithmetic.h new file mode 100644 index 0000000..774d342 --- /dev/null +++ b/shared/arithmetic.h @@ -0,0 +1,61 @@ +/****************************************************************************** +* Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +* THE SOFTWARE. +*******************************************************************************/ + +#pragma once + +#include +#include + +// arithmetic helper functions + +static inline bool IsPo2(size_t u) +{ + return (u != 0) && (0 == (u & (u - 1))); +} + +// help function: Find the smallest power of 2 that is >= n; return its +// power of 2 factor +// e.g., CeilPo2 (7) returns 3 : (2^3 >= 7) +static inline size_t CeilPo2(size_t n) +{ + size_t v = 1, t = 0; + while(v < n) + { + v <<= 1; + t++; + } + + return t; +} + +template +static inline T DivRoundingUp(T a, T b) +{ + return (a + (b - 1)) / b; +} + +template +typename Titer::value_type product(Titer begin, Titer end) +{ + return std::accumulate( + begin, end, typename Titer::value_type(1), std::multiplies()); +} diff --git a/shared/array_predicate.h b/shared/array_predicate.h new file mode 100644 index 0000000..92e45b4 --- /dev/null +++ b/shared/array_predicate.h @@ -0,0 +1,47 @@ +// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#ifndef ROCFFT_ARRAY_PREDICATE_H +#define ROCFFT_ARRAY_PREDICATE_H + +#include "rocfft/rocfft.h" + +namespace +{ + bool array_type_is_complex(rocfft_array_type type) + { + return type == rocfft_array_type_complex_interleaved + || type == rocfft_array_type_complex_planar + || type == rocfft_array_type_hermitian_interleaved + || type == rocfft_array_type_hermitian_planar; + } + bool array_type_is_interleaved(rocfft_array_type type) + { + return type == rocfft_array_type_complex_interleaved + || type == rocfft_array_type_hermitian_interleaved; + } + bool array_type_is_planar(rocfft_array_type type) + { + return type == rocfft_array_type_complex_planar + || type == rocfft_array_type_hermitian_planar; + } +} + +#endif diff --git a/shared/array_validator.cpp b/shared/array_validator.cpp new file mode 100644 index 0000000..70abb08 --- /dev/null +++ b/shared/array_validator.cpp @@ -0,0 +1,549 @@ +// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#include +#include +#include + +#include "array_validator.h" +#include "increment.h" + +// Check a 2D array for collisions. +// The 2D case can be determined via a number-theoretic argument. +bool valid_length_stride_2d(const size_t l0, const size_t l1, const size_t s0, const size_t s1) +{ + if(s0 == s1) + return false; + const auto c = std::lcm(s0, s1); + return !((s0 * (l0 - 1) >= c) && (s1 * (l1 - 1) >= c)); +} + +// Compare a 1D direction with a multi-index hyperface for collisions. +bool valid_length_stride_1d_multi(const unsigned int idx, + const std::vector l, + const std::vector s, + const int verbose) +{ + size_t l0{0}, s0{0}; + std::vector l1{}, s1{}; + for(unsigned int i = 0; i < l.size(); ++i) + { + if(i == idx) + { + l0 = l[i]; + s0 = s[i]; + } + else + { + l1.push_back(l[i]); + s1.push_back(s[i]); + } + } + + if(verbose > 4) + { + std::cout << "l0: " << l0 << "\ts0: " << s0 << std::endl; + } + + // We only need to go to the maximum pointer offset for (l1,s1). 
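+    // Offsets along direction idx that exceed the largest offset reachable by
+    // the remaining indices can never collide with them, so only multiples of
+    // s0 up to that bound need to be recorded in the candidate set below.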
+ const auto max_offset + = std::accumulate(l1.begin(), l1.end(), (size_t)1, std::multiplies()) + - std ::inner_product(l1.begin(), l1.end(), s1.begin(), (size_t)0); + std::unordered_set a0{}; + for(size_t i = 1; i < l0; ++i) + { + const auto val = i * s0; + if(val <= max_offset) + a0.insert(val); + else + break; + } + + if(verbose > 5) + { + std::cout << "a0:"; + for(auto i : a0) + std::cout << " " << i; + std::cout << std::endl; + + std::cout << "l1:"; + for(auto i : l1) + std::cout << " " << i; + std::cout << std::endl; + + std::cout << "s1:"; + for(auto i : s1) + std::cout << " " << i; + std::cout << std::endl; + } + + // TODO: this can be multi-threaded, since find(...) is thread-safe. + std::vector index(l1.size()); + std::fill(index.begin(), index.end(), 0); + do + { + const int i = std::inner_product(index.begin(), index.end(), s1.begin(), (size_t)0); + if(i > 0 && (i % s0 == 0)) + { + // TODO: use an ordered set and binary search + if(verbose > 6) + std::cout << i << std::endl; + if(a0.find(i) != a0.end()) + { + if(verbose > 4) + { + std::cout << "l0: " << l0 << "\ts0: " << s0 << std::endl; + std::cout << "l1:"; + for(const auto li : l1) + std::cout << " " << li; + std::cout << " s1:"; + for(const auto si : s1) + std::cout << " " << si; + std::cout << std::endl; + std::cout << "Found duplicate: " << i << std::endl; + } + return false; + } + } + } while(increment_rowmajor(index, l1)); + + return true; +} + +// Compare a hyperface with another hyperface for collisions. +bool valid_length_stride_multi_multi(const std::vector l0, + const std::vector s0, + const std::vector l1, + const std::vector s1) +{ + std::unordered_set a0{}; + + const auto max_offset + = std::accumulate(l1.begin(), l1.end(), (size_t)1, std::multiplies()) + - std::inner_product(l1.begin(), l1.end(), s1.begin(), (size_t)0); + std::vector index0(l0.size()); // TODO: check this + std::fill(index0.begin(), index0.end(), 0); + do + { + const auto i = std::inner_product(index0.begin(), index0.end(), s0.begin(), (size_t)0); + if(i > max_offset) + a0.insert(i); + } while(increment_rowmajor(index0, l0)); + + std::vector index1(l1.size()); + std::fill(index1.begin(), index1.end(), 0); + do + { + const auto i = std::inner_product(index1.begin(), index1.end(), s1.begin(), (size_t)0); + if(i > 0) + { + // TODO: use an ordered set and binary search + if(a0.find(i) != a0.end()) + { + + return false; + } + } + } while(increment_rowmajor(index1, l1)); + + return true; +} + +bool valid_length_stride_3d(const std::vector& l, + const std::vector& s, + const int verbose) +{ + // Check that 2D faces are valid: + if(!valid_length_stride_2d(l[0], l[1], s[0], s[1])) + return false; + if(!valid_length_stride_2d(l[0], l[2], s[0], s[2])) + return false; + if(!valid_length_stride_2d(l[1], l[2], s[1], s[2])) + return false; + + // If the 2D faces are valid, check an axis vs a face for collisions: + bool invalid = false; +#ifdef _OPENMP +#pragma omp parallel for +#endif + for(int idx = 0; idx < 3; ++idx) + { + if(!valid_length_stride_1d_multi(idx, l, s, verbose)) + { +#ifdef _OPENMP +#pragma omp cancel for +#endif + invalid = true; + } + } + if(invalid) + return false; + return true; +} + +bool valid_length_stride_4d(const std::vector& l, + const std::vector& s, + const int verbose) +{ + if(l.size() != 4) + { + throw std::runtime_error("Incorrect dimensions for valid_length_stride_4d"); + } + + // Check that 2D faces are valid: + for(int idx0 = 0; idx0 < 3; ++idx0) + { + for(int idx1 = idx0 + 1; idx1 < 4; ++idx1) + { + 
if(!valid_length_stride_2d(l[idx0], l[idx1], s[idx0], s[idx1])) + return false; + } + } + + bool invalid = false; + // Check that 1D vs 3D faces are valid: +#ifdef _OPENMP +#pragma omp parallel for +#endif + for(int idx0 = 0; idx0 < 4; ++idx0) + { + if(!valid_length_stride_1d_multi(idx0, l, s, verbose)) + { +#ifdef _OPENMP +#pragma omp cancel for +#endif + invalid = true; + } + } + if(invalid) + return false; + + // Check that 2D vs 2D faces are valid: + + // First, get all the permutations + std::vector> perms; + std::vector v(l.size()); + std::fill(v.begin(), v.begin() + 2, 0); + std::fill(v.begin() + 2, v.end(), 1); + do + { + perms.push_back(v); + if(verbose > 3) + { + std::cout << "v:"; + for(const auto i : v) + { + std::cout << " " << i; + } + std::cout << "\n"; + } + } while(std::next_permutation(v.begin(), v.end())); + + // Then loop over all of the permutations. +#ifdef _OPENMP +#pragma omp parallel for +#endif + for(size_t iperm = 0; iperm < perms.size(); ++iperm) + { + std::vector l0(2); + std::vector s0(2); + std::vector l1(2); + std::vector s1(2); + for(size_t i = 0; i < l.size(); ++i) + { + if(perms[iperm][i] == 0) + { + l0.push_back(l[i]); + s0.push_back(s[i]); + } + else + { + l1.push_back(l[i]); + s1.push_back(s[i]); + } + } + + if(verbose > 3) + { + std::cout << "\tl0:"; + for(const auto i : l0) + { + std::cout << " " << i; + } + std::cout << "\n"; + std::cout << "\ts0:"; + for(const auto i : s0) + { + std::cout << " " << i; + } + std::cout << "\n"; + std::cout << "\tl1:"; + for(const auto i : l1) + { + std::cout << " " << i; + } + std::cout << "\n"; + std::cout << "\ts1:"; + for(const auto i : s1) + { + std::cout << " " << i; + } + std::cout << "\n"; + } + + if(!valid_length_stride_multi_multi(l0, s0, l1, s1)) + { +#ifdef _OPENMP +#pragma omp cancel for +#endif + invalid = true; + } + } + if(invalid) + return false; + + return true; +} + +bool valid_length_stride_generald(const std::vector l, + const std::vector s, + const int verbose) +{ + if(verbose > 2) + { + std::cout << "checking dimension " << l.size() << std::endl; + } + + // Recurse on d-1 hyper-faces: + for(unsigned int idx = 0; idx < l.size(); ++idx) + { + std::vector l0{}; + std::vector s0{}; + for(size_t i = 0; i < l.size(); ++i) + { + if(i != idx) + { + l0.push_back(l[i]); + s0.push_back(s[i]); + } + } + if(!array_valid(l0, s0, verbose)) + return false; + } + + // Handle the 1D vs (N-1) case: + for(unsigned int idx = 0; idx < l.size(); ++idx) + { + if(!valid_length_stride_1d_multi(idx, l, s, verbose)) + return false; + } + + for(size_t dim0 = 2; dim0 <= l.size() / 2; ++dim0) + { + const size_t dim1 = l.size() - dim0; + if(verbose > 2) + std::cout << "dims: " << dim0 << " " << dim1 << std::endl; + + // We iterate over all permutations of an array of length l.size() which contains dim0 zeros + // and dim1 ones. We start with {0, ..., 0, 1, ... 1} to guarantee that we hit all the + // possibilities. + + // First, get all the permutations + std::vector> perms; + std::vector v(l.size()); + std::fill(v.begin(), v.begin() + dim1, 0); + std::fill(v.begin() + dim1, v.end(), 1); + do + { + perms.push_back(v); + if(verbose > 3) + { + std::cout << "v:"; + for(const auto i : v) + { + std::cout << " " << i; + } + std::cout << "\n"; + } + + } while(std::next_permutation(v.begin(), v.end())); + + bool invalid = false; + // Then loop over all of the permutations. 
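+        // Each permutation splits the axes into a dim0-subset and a dim1-subset;
+        // the two resulting sub-arrays are then cross-checked for overlapping
+        // offsets by valid_length_stride_multi_multi.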
+#ifdef _OPENMP +#pragma omp parallel for +#endif + for(size_t iperm = 0; iperm < perms.size(); ++iperm) + { + std::vector l0(dim0); + std::vector s0(dim0); + std::vector l1(dim1); + std::vector s1(dim1); + + for(size_t i = 0; i < l.size(); ++i) + { + if(v[i] == 0) + { + l0.push_back(l[i]); + s0.push_back(s[i]); + } + else + { + l1.push_back(l[i]); + s1.push_back(s[i]); + } + } + + if(verbose > 3) + { + std::cout << "\tl0:"; + for(const auto i : l0) + { + std::cout << " " << i; + } + std::cout << "\n"; + std::cout << "\ts0:"; + for(const auto i : s0) + { + std::cout << " " << i; + } + std::cout << "\n"; + std::cout << "\tl1:"; + for(const auto i : l1) + { + std::cout << " " << i; + } + std::cout << "\n"; + std::cout << "\ts1:"; + for(const auto i : s1) + { + std::cout << " " << i; + } + std::cout << "\n"; + } + + if(!valid_length_stride_multi_multi(l0, s0, l1, s1)) + { +#ifdef _OPENMP +#pragma omp cancel for +#endif + invalid = true; + } + } + if(invalid) + return false; + } + + return true; +} + +bool sort_by_stride(const std::pair& ls0, const std::pair& ls1) +{ + return ls0.second < ls1.second; +} + +bool array_valid(const std::vector& length, + const std::vector& stride, + const int verbose) +{ + if(length.size() != stride.size()) + return false; + + // If a length is 1, then the stride is irrelevant. + // If a length is > 1, then the corresponding stride must be > 1. + std::vector l{}, s{}; + for(unsigned int i = 0; i < length.size(); ++i) + { + if(length[i] > 1) + { + if(stride[i] == 0) + return false; + l.push_back(length[i]); + s.push_back(stride[i]); + } + } + + if(length.size() > 1) + { + // Check happy path. + bool happy_path = true; + std::vector> ls; + for(size_t idx = 0; idx < length.size(); ++idx) + { + ls.push_back(std::pair(length[idx], stride[idx])); + } + std::sort(ls.begin(), ls.end(), sort_by_stride); + + if(verbose > 2) + { + for(size_t idx = 0; idx < ls.size(); ++idx) + { + std::cout << ls[idx].first << "\t" << ls[idx].second << "\n"; + } + } + + for(size_t idx = 1; idx < ls.size(); ++idx) + { + if(ls[idx].second < ls[idx - 1].first * ls[idx - 1].second) + { + happy_path = false; + break; + } + } + if(happy_path) + { + if(verbose > 2) + { + std::cout << "happy path\n"; + } + return true; + } + } + + switch(l.size()) + { + case 0: + return true; + break; + case 1: + return s[0] != 0; + break; + case 2: + { + return valid_length_stride_2d(l[0], l[1], s[0], s[1]); + break; + } + case 3: + { + return valid_length_stride_3d(l, s, verbose); + break; + } + case 4: + { + return valid_length_stride_4d(l, s, verbose); + break; + } + default: + return valid_length_stride_generald(l, s, verbose); + return true; + } + + return true; +} diff --git a/shared/array_validator.h b/shared/array_validator.h new file mode 100644 index 0000000..ce85173 --- /dev/null +++ b/shared/array_validator.h @@ -0,0 +1,31 @@ +// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#ifndef ARRAY_VALIDATOR_H +#define ARRAY_VALIDATOR_H + +#include + +// Checks whether the array with given length and stride has multi-index collisions. +bool array_valid(const std::vector& length, + const std::vector& stride, + const int verbose = 0); + +#endif diff --git a/shared/concurrency.h b/shared/concurrency.h new file mode 100644 index 0000000..a36c7c1 --- /dev/null +++ b/shared/concurrency.h @@ -0,0 +1,41 @@ +// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#pragma once + +#include + +#ifndef WIN32 +#include +#endif + +// work out how many parallel tasks to run, based on available +// resources. on Linux, this will look at the cpu affinity mask (if +// available) which might be restricted in a container. otherwise, +// return std::thread::hardware_concurrency(). +static unsigned int rocfft_concurrency() +{ +#ifndef WIN32 + cpu_set_t cpuset; + if(sched_getaffinity(0, sizeof(cpuset), &cpuset) == 0) + return CPU_COUNT(&cpuset); +#endif + return std::thread::hardware_concurrency(); +} diff --git a/shared/data_gen_device.h b/shared/data_gen_device.h new file mode 100644 index 0000000..77fb012 --- /dev/null +++ b/shared/data_gen_device.h @@ -0,0 +1,1303 @@ +// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#ifndef DATA_GEN_DEVICE_H +#define DATA_GEN_DEVICE_H + +// rocRAND can generate warnings if inline asm is not available for +// some architectures. data generation isn't performance-critical, +// so just disable inline asm to prevent the warnings. +#define ROCRAND_DISABLE_INLINE_ASM + +#include "../shared/arithmetic.h" +#include "../shared/device_properties.h" +#include "../shared/gpubuf.h" +#include "../shared/increment.h" +#include "../shared/rocfft_complex.h" +#include +#include +#include +#include +#include +#include + +static const unsigned int DATA_GEN_THREADS = 8; +static const unsigned int DATA_GEN_GRID_Y_MAX = 64; + +template +struct input_val_1D +{ + T val1; +}; + +template +struct input_val_2D +{ + T val1; + T val2; +}; + +template +struct input_val_3D +{ + T val1; + T val2; + T val3; +}; + +template +static input_val_1D get_input_val(const T& val) +{ + return input_val_1D{val}; +} + +template +static input_val_2D get_input_val(const std::tuple& val) +{ + return input_val_2D{std::get<0>(val), std::get<1>(val)}; +} + +template +static input_val_3D get_input_val(const std::tuple& val) +{ + return input_val_3D{std::get<0>(val), std::get<1>(val), std::get<2>(val)}; +} + +template +__device__ static size_t + compute_index(const input_val_1D& length, const input_val_1D& stride, size_t base) +{ + return (length.val1 * stride.val1) + base; +} + +template +__device__ static size_t + compute_index(const input_val_2D& length, const input_val_2D& stride, size_t base) +{ + return (length.val1 * stride.val1) + (length.val2 * stride.val2) + base; +} + +template +__device__ static size_t + compute_index(const input_val_3D& length, const input_val_3D& stride, size_t base) +{ + return (length.val1 * stride.val1) + (length.val2 * stride.val2) + (length.val3 * stride.val3) + + base; +} + +template +static inline input_val_1D make_zero_length(const input_val_1D& whole_length) +{ + return input_val_1D{0}; +} + +template +static inline input_val_2D make_zero_length(const input_val_2D& whole_length) +{ + return input_val_2D{0, 0}; +} + +template +static inline input_val_3D make_zero_length(const input_val_3D& whole_length) +{ + return input_val_3D{0, 0, 0}; +} + +template +static inline input_val_1D make_unit_stride(const input_val_1D& whole_length) +{ + return input_val_1D{1}; +} + +template +static inline input_val_2D make_unit_stride(const input_val_2D& whole_length) +{ + return input_val_2D{1, whole_length.val1}; +} + +template +static inline input_val_3D make_unit_stride(const input_val_3D& whole_length) +{ + return input_val_3D{1, whole_length.val1, whole_length.val1 * whole_length.val2}; +} + +template +__device__ static input_val_1D get_length(const size_t i, const input_val_1D& whole_length) +{ + auto xlen = whole_length.val1; + + auto xidx = i % xlen; + + return input_val_1D{xidx}; +} + +template +__device__ static input_val_2D get_length(const size_t i, const input_val_2D& whole_length) +{ + auto xlen = whole_length.val1; + auto ylen = whole_length.val2; + + auto xidx = i % xlen; + auto yidx = i / xlen % 
ylen; + + return input_val_2D{xidx, yidx}; +} + +template +__device__ static input_val_3D get_length(const size_t i, const input_val_3D& whole_length) +{ + auto xlen = whole_length.val1; + auto ylen = whole_length.val2; + auto zlen = whole_length.val3; + + auto xidx = i % xlen; + auto yidx = i / xlen % ylen; + auto zidx = i / xlen / ylen % zlen; + + return input_val_3D{xidx, yidx, zidx}; +} + +template +__device__ static size_t get_batch(const size_t i, const input_val_1D& whole_length) +{ + auto xlen = whole_length.val1; + + auto yidx = i / xlen; + + return yidx; +} + +template +__device__ static size_t get_batch(const size_t i, const input_val_2D& whole_length) +{ + auto xlen = whole_length.val1; + auto ylen = whole_length.val2; + + auto zidx = i / xlen / ylen; + + return zidx; +} + +template +__device__ static size_t get_batch(const size_t i, const input_val_3D& length) +{ + auto xlen = length.val1; + auto ylen = length.val2; + auto zlen = length.val3; + + auto widx = i / xlen / ylen / zlen; + + return widx; +} + +__device__ static double make_random_val(hiprandStatePhilox4_32_10* gen_state, double offset) +{ + return hiprand_uniform_double(gen_state) + offset; +} + +__device__ static float make_random_val(hiprandStatePhilox4_32_10* gen_state, float offset) +{ + return hiprand_uniform(gen_state) + offset; +} + +__device__ static _Float16 make_random_val(hiprandStatePhilox4_32_10* gen_state, _Float16 offset) +{ + return static_cast<_Float16>(hiprand_uniform(gen_state)) + offset; +} + +template +__device__ static void set_imag_zero(const size_t pos, Tcomplex* x) +{ + x[pos].y = 0.0; +} + +template +__device__ static void set_imag_zero(const size_t pos, Tfloat* xreal, Tfloat* ximag) +{ + ximag[pos] = 0.0; +} + +template +__device__ static void conjugate(const size_t pos, const size_t cpos, Tcomplex* x) +{ + x[pos].x = x[cpos].x; + x[pos].y = -x[cpos].y; +} + +template +__device__ static void conjugate(const size_t pos, const size_t cpos, Tfloat* xreal, Tfloat* ximag) +{ + xreal[pos] = xreal[cpos]; + ximag[pos] = -ximag[cpos]; +} + +template +__global__ static void __launch_bounds__(DATA_GEN_THREADS) + generate_random_interleaved_data_kernel(const Tint whole_length, + const Tint zero_length, + const size_t idist, + const size_t isize, + const Tint istride, + rocfft_complex* data) +{ + auto const i = static_cast(threadIdx.x) + blockIdx.x * blockDim.x + + blockIdx.y * gridDim.x * DATA_GEN_THREADS; + static_assert(sizeof(i) >= sizeof(isize)); + if(i < isize) + { + auto i_length = get_length(i, whole_length); + auto i_batch = get_batch(i, whole_length); + auto i_base = i_batch * idist; + + auto seed = compute_index(zero_length, istride, i_base); + auto idx = compute_index(i_length, istride, i_base); + + hiprandStatePhilox4_32_10 gen_state; + hiprand_init(seed, idx, 0, &gen_state); + + data[idx].x = make_random_val(&gen_state, static_cast(-0.5)); + data[idx].y = make_random_val(&gen_state, static_cast(-0.5)); + } +} + +template +__global__ static void __launch_bounds__(DATA_GEN_THREADS) + generate_interleaved_data_kernel(const Tint whole_length, + const size_t idist, + const size_t isize, + const Tint istride, + const Tint ustride, + const Treal inv_scale, + rocfft_complex* data) +{ + auto const i = static_cast(threadIdx.x) + blockIdx.x * blockDim.x + + blockIdx.y * gridDim.x * DATA_GEN_THREADS; + static_assert(sizeof(i) >= sizeof(isize)); + if(i < isize) + { + const auto i_length = get_length(i, whole_length); + const auto i_batch = get_batch(i, whole_length); + const auto i_base = i_batch * 
idist; + + const auto val = static_cast(-0.5) + + static_cast( + static_cast(compute_index(i_length, ustride, 0))) + * inv_scale; + + const auto idx = compute_index(i_length, istride, i_base); + + data[idx].x = val; + data[idx].y = val; + } +} + +template +__global__ static void __launch_bounds__(DATA_GEN_THREADS) + generate_random_planar_data_kernel(const Tint whole_length, + const Tint zero_length, + const size_t idist, + const size_t isize, + const Tint istride, + Treal* real_data, + Treal* imag_data) +{ + auto const i = static_cast(threadIdx.x) + blockIdx.x * blockDim.x + + blockIdx.y * gridDim.x * DATA_GEN_THREADS; + static_assert(sizeof(i) >= sizeof(isize)); + if(i < isize) + { + auto i_length = get_length(i, whole_length); + auto i_batch = get_batch(i, whole_length); + auto i_base = i_batch * idist; + + auto seed = compute_index(zero_length, istride, i_base); + auto idx = compute_index(i_length, istride, i_base); + + hiprandStatePhilox4_32_10 gen_state; + hiprand_init(seed, idx, 0, &gen_state); + + real_data[idx] = make_random_val(&gen_state, static_cast(-0.5)); + imag_data[idx] = make_random_val(&gen_state, static_cast(-0.5)); + } +} + +template +__global__ static void __launch_bounds__(DATA_GEN_THREADS) + generate_planar_data_kernel(const Tint whole_length, + const size_t idist, + const size_t isize, + const Tint istride, + const Tint ustride, + const Treal inv_scale, + Treal* real_data, + Treal* imag_data) +{ + auto const i = static_cast(threadIdx.x) + blockIdx.x * blockDim.x + + blockIdx.y * gridDim.x * DATA_GEN_THREADS; + static_assert(sizeof(i) >= sizeof(isize)); + if(i < isize) + { + const auto i_length = get_length(i, whole_length); + const auto i_batch = get_batch(i, whole_length); + const auto i_base = i_batch * idist; + + const auto val = static_cast(-0.5) + + static_cast( + static_cast(compute_index(i_length, ustride, 0))) + * inv_scale; + + const auto idx = compute_index(i_length, istride, i_base); + + real_data[idx] = val; + imag_data[idx] = val; + } +} + +template +__global__ static void __launch_bounds__(DATA_GEN_THREADS) + generate_random_real_data_kernel(const Tint whole_length, + const Tint zero_length, + const size_t idist, + const size_t isize, + const Tint istride, + Treal* data) +{ + auto const i = static_cast(threadIdx.x) + blockIdx.x * blockDim.x + + blockIdx.y * gridDim.x * DATA_GEN_THREADS; + static_assert(sizeof(i) >= sizeof(isize)); + if(i < isize) + { + auto i_length = get_length(i, whole_length); + auto i_batch = get_batch(i, whole_length); + auto i_base = i_batch * idist; + + auto seed = compute_index(zero_length, istride, i_base); + auto idx = compute_index(i_length, istride, i_base); + + hiprandStatePhilox4_32_10 gen_state; + hiprand_init(seed, idx, 0, &gen_state); + + data[idx] = make_random_val(&gen_state, static_cast(-0.5)); + } +} + +template +__global__ static void __launch_bounds__(DATA_GEN_THREADS) + generate_real_data_kernel(const Tint whole_length, + const size_t idist, + const size_t isize, + const Tint istride, + const Tint ustride, + const Treal inv_scale, + Treal* data) +{ + auto const i = static_cast(threadIdx.x) + blockIdx.x * blockDim.x + + blockIdx.y * gridDim.x * DATA_GEN_THREADS; + static_assert(sizeof(i) >= sizeof(isize)); + if(i < isize) + { + const auto i_length = get_length(i, whole_length); + const auto i_batch = get_batch(i, whole_length); + const auto i_base = i_batch * idist; + + const auto val = static_cast(-0.5) + + static_cast( + static_cast(compute_index(i_length, ustride, 0))) + * inv_scale; + + const auto idx = 
compute_index(i_length, istride, i_base); + + data[idx] = val; + } +} + +// For complex-to-real transforms, the input data must be Hermitiam-symmetric. +// That is, u_k is the complex conjugate of u_{-k}, where k is the wavevector in Fourier +// space. For multi-dimensional data, this means that we only need to store a bit more +// than half of the complex values; the rest are redundant. However, there are still +// some restrictions: +// * the origin and Nyquist value(s) must be real-valued +// * some of the remaining values are still redundant, and you might get different results +// than you expect if the values don't agree. + +template +__global__ static void impose_hermitian_symmetry_interleaved_1D_kernel(Tcomplex* x, + const size_t Nx, + const size_t xstride, + const size_t dist, + const size_t batch_total, + const bool Nxeven) +{ + auto id_batch = static_cast(threadIdx.x) + blockIdx.x * blockDim.x; + static_assert(sizeof(id_batch) == sizeof(size_t)); + + if(id_batch < batch_total) + { + id_batch *= dist; + + set_imag_zero(id_batch, x); + + if(Nxeven) + set_imag_zero(id_batch + (Nx / 2) * xstride, x); + } +} + +template +__global__ static void impose_hermitian_symmetry_planar_1D_kernel(Tfloat* xreal, + Tfloat* ximag, + const size_t Nx, + const size_t xstride, + const size_t dist, + const size_t batch_total, + const bool Nxeven) +{ + auto id_batch = static_cast(threadIdx.x) + blockIdx.x * blockDim.x; + static_assert(sizeof(id_batch) == sizeof(size_t)); + + if(id_batch < batch_total) + { + id_batch *= dist; + + set_imag_zero(id_batch, xreal, ximag); + + if(Nxeven) + set_imag_zero(id_batch + (Nx / 2) * xstride, xreal, ximag); + } +} + +template +__global__ static void impose_hermitian_symmetry_interleaved_2D_kernel(Tcomplex* x, + const size_t Nx, + const size_t Ny, + const size_t xstride, + const size_t ystride, + const size_t dist, + const size_t batch_total, + const size_t x_total, + const bool Nxeven, + const bool Nyeven) +{ + auto id_batch = static_cast(threadIdx.x) + blockIdx.x * blockDim.x; + const auto id_x = static_cast(threadIdx.y) + blockIdx.y * blockDim.y; + static_assert(sizeof(id_batch) == sizeof(size_t)); + static_assert(sizeof(id_x) == sizeof(size_t)); + + if(id_batch < batch_total) + { + id_batch *= dist; + + if(id_x == 0) + set_imag_zero(id_batch, x); + + if(id_x == 0 && Nxeven) + set_imag_zero(id_batch + (Nx / 2) * xstride, x); + + if(id_x == 0 && Nyeven) + set_imag_zero(id_batch + ystride * (Ny / 2), x); + + if(id_x == 0 && Nxeven && Nyeven) + set_imag_zero(id_batch + xstride * (Nx / 2) + ystride * (Ny / 2), x); + + if(id_x < x_total) + { + conjugate(id_batch + xstride * (Nx - (id_x + 1)), id_batch + xstride * (id_x + 1), x); + + if(Nyeven) + conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny / 2), + id_batch + xstride * (id_x + 1) + ystride * (Ny / 2), + x); + } + } +} + +template +__global__ static void impose_hermitian_symmetry_planar_2D_kernel(Tfloat* xreal, + Tfloat* ximag, + const size_t Nx, + const size_t Ny, + const size_t xstride, + const size_t ystride, + const size_t dist, + const size_t batch_total, + const size_t x_total, + const bool Nxeven, + const bool Nyeven) +{ + auto id_batch = static_cast(threadIdx.x) + blockIdx.x * blockDim.x; + const auto id_x = static_cast(threadIdx.y) + blockIdx.y * blockDim.y; + static_assert(sizeof(id_batch) == sizeof(size_t)); + static_assert(sizeof(id_x) == sizeof(size_t)); + + if(id_batch < batch_total) + { + id_batch *= dist; + + if(id_x == 0) + set_imag_zero(id_batch, xreal, ximag); + + if(id_x == 0 && 
Nxeven) + set_imag_zero(id_batch + (Nx / 2) * xstride, xreal, ximag); + + if(id_x == 0 && Nyeven) + set_imag_zero(id_batch + ystride * (Ny / 2), xreal, ximag); + + if(id_x == 0 && Nxeven && Nyeven) + set_imag_zero(id_batch + xstride * (Nx / 2) + ystride * (Ny / 2), xreal, ximag); + + if(id_x < x_total) + { + conjugate(id_batch + xstride * (Nx - (id_x + 1)), + id_batch + xstride * (id_x + 1), + xreal, + ximag); + + if(Nyeven) + conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny / 2), + id_batch + xstride * (id_x + 1) + ystride * (Ny / 2), + xreal, + ximag); + } + } +} + +template +__global__ static void impose_hermitian_symmetry_interleaved_3D_kernel(Tcomplex* x, + const size_t Nx, + const size_t Ny, + const size_t Nz, + const size_t xstride, + const size_t ystride, + const size_t zstride, + const size_t dist, + const size_t batch_total, + const size_t x_total, + const size_t y_total, + const size_t y_total_half, + const bool Nxeven, + const bool Nyeven, + const bool Nzeven) +{ + auto id_batch = static_cast(threadIdx.x) + blockIdx.x * blockDim.x; + const auto id_x = static_cast(threadIdx.y) + blockIdx.y * blockDim.y; + const auto id_y = static_cast(threadIdx.z) + blockIdx.z * blockDim.z; + static_assert(sizeof(id_batch) == sizeof(size_t)); + static_assert(sizeof(id_x) == sizeof(size_t)); + static_assert(sizeof(id_y) == sizeof(size_t)); + + if(id_batch < batch_total) + { + auto id_x_y_zero = (id_x == 0 && id_y == 0); + + id_batch *= dist; + + if(id_x_y_zero) + set_imag_zero(id_batch, x); + + if(Nxeven && id_x_y_zero) + set_imag_zero(id_batch + xstride * (Nx / 2), x); + + if(Nyeven && id_x_y_zero) + set_imag_zero(id_batch + ystride * (Ny / 2), x); + + if(Nzeven && id_x_y_zero) + set_imag_zero(id_batch + zstride * (Nz / 2), x); + + if(Nxeven && Nyeven && id_x_y_zero) + set_imag_zero(id_batch + xstride * (Nx / 2) + ystride * (Ny / 2), x); + + if(Nxeven && Nzeven && id_x_y_zero) + set_imag_zero(id_batch + xstride * (Nx / 2) + zstride * (Nz / 2), x); + + if(Nyeven && Nzeven && id_x_y_zero) + set_imag_zero(id_batch + ystride * (Ny / 2) + zstride * (Nz / 2), x); + + if(Nxeven && Nyeven && Nzeven && id_x_y_zero) + set_imag_zero(id_batch + xstride * (Nx / 2) + ystride * (Ny / 2) + zstride * (Nz / 2), + x); + + if(id_x == 0 && id_y < y_total_half) + conjugate(id_batch + ystride * (Ny - (id_y + 1)), id_batch + ystride * (id_y + 1), x); + + if(Nxeven && id_x == 0 && id_y < y_total_half) + conjugate(id_batch + xstride * (Nx / 2) + ystride * (Ny - (id_y + 1)), + id_batch + xstride * (Nx / 2) + ystride * (id_y + 1), + x); + + if(id_x < x_total && id_y == 0) + conjugate(id_batch + xstride * (Nx - (id_x + 1)), id_batch + xstride * (id_x + 1), x); + + if(Nyeven && id_x < x_total && id_y == 0) + conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny / 2), + id_batch + xstride * (id_x + 1) + ystride * (Ny / 2), + x); + + if(id_x < x_total && id_y < y_total) + conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny - (id_y + 1)), + id_batch + xstride * (id_x + 1) + ystride * (id_y + 1), + x); + + if(Nzeven) + { + if(id_x < x_total && id_y == 0) + conjugate(id_batch + xstride * (Nx - (id_x + 1)) + zstride * (Nz / 2), + id_batch + xstride * (id_x + 1) + zstride * (Nz / 2), + x); + + if(Nyeven && id_x < x_total && id_y == 0) + conjugate(id_batch + xstride * (Nx - (id_x + 1)) + zstride * (Nz / 2), + id_batch + xstride * (id_x + 1) + zstride * (Nz / 2), + x); + + if(id_x == 0 && id_y < y_total_half) + conjugate(id_batch + ystride * (Ny - (id_y + 1)) + zstride * (Nz / 2), + id_batch + 
ystride * (id_y + 1) + zstride * (Nz / 2), + x); + + if(Nxeven && id_x == 0 && id_y < y_total_half) + conjugate(id_batch + xstride * (Nx / 2) + ystride * (Ny - (id_y + 1)) + + zstride * (Nz / 2), + id_batch + xstride * (Nx / 2) + ystride * (id_y + 1) + zstride * (Nz / 2), + x); + + if(id_x < x_total && id_y < y_total) + conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny - (id_y + 1)) + + zstride * (Nz / 2), + id_batch + xstride * (id_x + 1) + ystride * (id_y + 1) + + zstride * (Nz / 2), + x); + } + } +} + +template +__global__ static void impose_hermitian_symmetry_planar_3D_kernel(Tfloat* xreal, + Tfloat* ximag, + const size_t Nx, + const size_t Ny, + const size_t Nz, + const size_t xstride, + const size_t ystride, + const size_t zstride, + const size_t dist, + const size_t batch_total, + const size_t x_total, + const size_t y_total, + const size_t y_total_half, + const bool Nxeven, + const bool Nyeven, + const bool Nzeven) +{ + auto id_batch = static_cast(threadIdx.x) + blockIdx.x * blockDim.x; + const auto id_x = static_cast(threadIdx.y) + blockIdx.y * blockDim.y; + const auto id_y = static_cast(threadIdx.z) + blockIdx.z * blockDim.z; + static_assert(sizeof(id_batch) == sizeof(size_t)); + static_assert(sizeof(id_x) == sizeof(size_t)); + static_assert(sizeof(id_y) == sizeof(size_t)); + + if(id_batch < batch_total) + { + auto id_x_y_zero = (id_x == 0 && id_y == 0); + + id_batch *= dist; + + if(id_x_y_zero) + set_imag_zero(id_batch, xreal, ximag); + + if(Nxeven && id_x_y_zero) + set_imag_zero(id_batch + xstride * (Nx / 2), xreal, ximag); + + if(Nyeven && id_x_y_zero) + set_imag_zero(id_batch + ystride * (Ny / 2), xreal, ximag); + + if(Nzeven && id_x_y_zero) + set_imag_zero(id_batch + zstride * (Nz / 2), xreal, ximag); + + if(Nxeven && Nyeven && id_x_y_zero) + set_imag_zero(id_batch + xstride * (Nx / 2) + ystride * (Ny / 2), xreal, ximag); + + if(Nxeven && Nzeven && id_x_y_zero) + set_imag_zero(id_batch + xstride * (Nx / 2) + zstride * (Nz / 2), xreal, ximag); + + if(Nyeven && Nzeven && id_x_y_zero) + set_imag_zero(id_batch + ystride * (Ny / 2) + zstride * (Nz / 2), xreal, ximag); + + if(Nxeven && Nyeven && Nzeven && id_x_y_zero) + set_imag_zero(id_batch + xstride * (Nx / 2) + ystride * (Ny / 2) + zstride * (Nz / 2), + xreal, + ximag); + + if(id_x == 0 && id_y < y_total_half) + conjugate(id_batch + ystride * (Ny - (id_y + 1)), + id_batch + ystride * (id_y + 1), + xreal, + ximag); + + if(Nxeven && id_x == 0 && id_y < y_total_half) + conjugate(id_batch + xstride * (Nx / 2) + ystride * (Ny - (id_y + 1)), + id_batch + xstride * (Nx / 2) + ystride * (id_y + 1), + xreal, + ximag); + + if(id_x < x_total && id_y == 0) + conjugate(id_batch + xstride * (Nx - (id_x + 1)), + id_batch + xstride * (id_x + 1), + xreal, + ximag); + + if(Nyeven && id_x < x_total && id_y == 0) + conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny / 2), + id_batch + xstride * (id_x + 1) + ystride * (Ny / 2), + xreal, + ximag); + + if(id_x < x_total && id_y < y_total) + conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny - (id_y + 1)), + id_batch + xstride * (id_x + 1) + ystride * (id_y + 1), + xreal, + ximag); + + if(Nzeven) + { + if(id_x < x_total && id_y == 0) + conjugate(id_batch + xstride * (Nx - (id_x + 1)) + zstride * (Nz / 2), + id_batch + xstride * (id_x + 1) + zstride * (Nz / 2), + xreal, + ximag); + + if(Nyeven && id_x < x_total && id_y == 0) + conjugate(id_batch + xstride * (Nx - (id_x + 1)) + zstride * (Nz / 2), + id_batch + xstride * (id_x + 1) + zstride * (Nz / 2), + xreal, 
+ ximag); + + if(id_x == 0 && id_y < y_total_half) + conjugate(id_batch + ystride * (Ny - (id_y + 1)) + zstride * (Nz / 2), + id_batch + ystride * (id_y + 1) + zstride * (Nz / 2), + xreal, + ximag); + + if(Nxeven && id_x == 0 && id_y < y_total_half) + conjugate(id_batch + xstride * (Nx / 2) + ystride * (Ny - (id_y + 1)) + + zstride * (Nz / 2), + id_batch + xstride * (Nx / 2) + ystride * (id_y + 1) + zstride * (Nz / 2), + xreal, + ximag); + + if(id_x < x_total && id_y < y_total) + conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny - (id_y + 1)) + + zstride * (Nz / 2), + id_batch + xstride * (id_x + 1) + ystride * (id_y + 1) + + zstride * (Nz / 2), + xreal, + ximag); + } + } +} + +// get grid dimensions for data gen kernel +static dim3 generate_data_gridDim(const size_t isize) +{ + auto blockSize = DATA_GEN_THREADS; + // total number of blocks needed in the grid + auto numBlocks_setup = DivRoundingUp(isize, blockSize); + + // Total work items per dimension in the grid is counted in + // uint32_t. Since each thread initializes one element, very + // large amounts of data will overflow this total size if we do + // all this work in one grid dimension, causing launch failure. + // + // CUDA also generally allows for effectively unlimited grid X + // dim, but Y and Z are more limited. + auto gridDim_y = std::min(DATA_GEN_GRID_Y_MAX, numBlocks_setup); + auto gridDim_x = DivRoundingUp(numBlocks_setup, DATA_GEN_GRID_Y_MAX); + return {gridDim_x, gridDim_y}; +} + +// get grid dimensions for hermitian symmetrizer kernel +static dim3 generate_hermitian_gridDim(const std::vector& length, + const size_t batch, + const size_t blockSize) +{ + dim3 gridDim; + + switch(length.size()) + { + case 1: + gridDim = dim3(DivRoundingUp(batch, blockSize)); + break; + case 2: + gridDim = dim3(DivRoundingUp(batch, blockSize), + DivRoundingUp((length[0] + 1) / 2 - 1, blockSize)); + break; + case 3: + gridDim = dim3(DivRoundingUp(batch, blockSize), + DivRoundingUp((length[0] + 1) / 2 - 1, blockSize), + DivRoundingUp(length[1] - 1, blockSize)); + break; + default: + throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry"); + } + + return gridDim; +} + +static dim3 generate_blockDim(const std::vector& length, const size_t blockSize) +{ + dim3 blockDim; + + switch(length.size()) + { + case 1: + blockDim = dim3(blockSize); + break; + case 2: + blockDim = dim3(blockSize, blockSize); + break; + case 3: + blockDim = dim3(blockSize, blockSize, blockSize); + break; + default: + throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry"); + } + + return blockDim; +} + +template +static void generate_random_interleaved_data(const Tint& whole_length, + const size_t idist, + const size_t isize, + const Tint& whole_stride, + rocfft_complex* input_data, + const hipDeviceProp_t& deviceProp) +{ + auto input_length = get_input_val(whole_length); + auto zero_length = make_zero_length(input_length); + auto input_stride = get_input_val(whole_stride); + + dim3 gridDim = generate_data_gridDim(isize); + dim3 blockDim{DATA_GEN_THREADS}; + + launch_limits_check("generate_random_interleaved_data_kernel", gridDim, blockDim, deviceProp); + + hipLaunchKernelGGL( + HIP_KERNEL_NAME(generate_random_interleaved_data_kernel), + gridDim, + blockDim, + 0, // sharedMemBytes + 0, // stream + input_length, + zero_length, + idist, + isize, + input_stride, + input_data); + auto err = hipGetLastError(); + if(err != hipSuccess) + throw std::runtime_error("generate_random_interleaved_data_kernel launch failure: " + + 
std::string(hipGetErrorName(err))); +} + +template +static void generate_interleaved_data(const Tint& whole_length, + const size_t idist, + const size_t isize, + const Tint& whole_stride, + const size_t nbatch, + rocfft_complex* input_data, + const hipDeviceProp_t& deviceProp) +{ + const auto input_length = get_input_val(whole_length); + const auto input_stride = get_input_val(whole_stride); + const auto unit_stride = make_unit_stride(input_length); + + const auto inv_scale + = static_cast(1.0) + / static_cast(static_cast(isize) / nbatch - 1); + + dim3 gridDim = generate_data_gridDim(isize); + dim3 blockDim{DATA_GEN_THREADS}; + + launch_limits_check("generate_interleaved_data_kernel", gridDim, blockDim, deviceProp); + + hipLaunchKernelGGL( + HIP_KERNEL_NAME(generate_interleaved_data_kernel), + gridDim, + blockDim, + 0, // sharedMemBytes + 0, // stream + input_length, + idist, + isize, + input_stride, + unit_stride, + inv_scale, + input_data); + auto err = hipGetLastError(); + if(err != hipSuccess) + throw std::runtime_error("generate_interleaved_data_kernel launch failure: " + + std::string(hipGetErrorName(err))); +} + +template +static void generate_random_planar_data(const Tint& whole_length, + const size_t idist, + const size_t isize, + const Tint& whole_stride, + Treal* real_data, + Treal* imag_data, + const hipDeviceProp_t& deviceProp) +{ + const auto input_length = get_input_val(whole_length); + const auto zero_length = make_zero_length(input_length); + const auto input_stride = get_input_val(whole_stride); + + dim3 gridDim = generate_data_gridDim(isize); + dim3 blockDim{DATA_GEN_THREADS}; + + launch_limits_check("generate_random_planar_data_kernel", gridDim, blockDim, deviceProp); + + hipLaunchKernelGGL( + HIP_KERNEL_NAME(generate_random_planar_data_kernel), + gridDim, + blockDim, + 0, // sharedMemBytes + 0, // stream + input_length, + zero_length, + idist, + isize, + input_stride, + real_data, + imag_data); + auto err = hipGetLastError(); + if(err != hipSuccess) + throw std::runtime_error("generate_random_planar_data_kernel launch failure: " + + std::string(hipGetErrorName(err))); +} + +template +static void generate_planar_data(const Tint& whole_length, + const size_t idist, + const size_t isize, + const Tint& whole_stride, + const size_t nbatch, + Treal* real_data, + Treal* imag_data, + const hipDeviceProp_t& deviceProp) +{ + const auto input_length = get_input_val(whole_length); + const auto input_stride = get_input_val(whole_stride); + const auto unit_stride = make_unit_stride(input_length); + + const auto inv_scale + = static_cast(1.0) + / static_cast(static_cast(isize) / nbatch - 1); + + dim3 gridDim = generate_data_gridDim(isize); + dim3 blockDim{DATA_GEN_THREADS}; + + launch_limits_check("generate_planar_data_kernel", gridDim, blockDim, deviceProp); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_planar_data_kernel), + gridDim, + blockDim, + 0, // sharedMemBytes + 0, // stream + input_length, + idist, + isize, + input_stride, + unit_stride, + inv_scale, + real_data, + imag_data); + auto err = hipGetLastError(); + if(err != hipSuccess) + throw std::runtime_error("generate_planar_data_kernel launch failure: " + + std::string(hipGetErrorName(err))); +} + +template +static void generate_random_real_data(const Tint& whole_length, + const size_t idist, + const size_t isize, + const Tint& whole_stride, + Treal* input_data, + const hipDeviceProp_t& deviceProp) +{ + const auto input_length = get_input_val(whole_length); + const auto zero_length = make_zero_length(input_length); + 
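+    // As in the other random generators, zero_length collapses the seed index to
+    // the batch's base offset, so the kernel seeds its Philox state once per batch
+    // and uses the element index as the subsequence; generated data is therefore
+    // reproducible for a given layout.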
const auto input_stride = get_input_val(whole_stride); + + dim3 gridDim = generate_data_gridDim(isize); + dim3 blockDim{DATA_GEN_THREADS}; + + launch_limits_check("generate_random_real_data_kernel", gridDim, blockDim, deviceProp); + + hipLaunchKernelGGL( + HIP_KERNEL_NAME(generate_random_real_data_kernel), + gridDim, + blockDim, + 0, // sharedMemBytes + 0, // stream + input_length, + zero_length, + idist, + isize, + input_stride, + input_data); + auto err = hipGetLastError(); + if(err != hipSuccess) + throw std::runtime_error("generate_random_real_data_kernel launch failure: " + + std::string(hipGetErrorName(err))); +} + +template +static void generate_real_data(const Tint& whole_length, + const size_t idist, + const size_t isize, + const Tint& whole_stride, + const size_t nbatch, + Treal* input_data, + const hipDeviceProp_t& deviceProp) +{ + const auto input_length = get_input_val(whole_length); + const auto input_stride = get_input_val(whole_stride); + const auto unit_stride = make_unit_stride(input_length); + + const auto inv_scale + = static_cast(1.0) + / static_cast(static_cast(isize) / nbatch - 1); + + dim3 gridDim = generate_data_gridDim(isize); + dim3 blockDim{DATA_GEN_THREADS}; + + launch_limits_check("generate_real_data_kernel", gridDim, blockDim, deviceProp); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_real_data_kernel), + gridDim, + blockDim, + 0, // sharedMemBytes + 0, // stream + input_length, + idist, + isize, + input_stride, + unit_stride, + inv_scale, + input_data); + auto err = hipGetLastError(); + if(err != hipSuccess) + throw std::runtime_error("generate_real_data_kernel launch failure: " + + std::string(hipGetErrorName(err))); +} + +template +static void impose_hermitian_symmetry_interleaved(const std::vector& length, + const std::vector& ilength, + const std::vector& stride, + const size_t dist, + const size_t batch, + Tcomplex* input_data, + const hipDeviceProp_t& deviceProp) +{ + auto blockSize = DATA_GEN_THREADS; + auto blockDim = generate_blockDim(length, blockSize); + auto gridDim = generate_hermitian_gridDim(length, batch, blockSize); + + switch(length.size()) + { + case 1: + { + launch_limits_check( + "impose_hermitian_symmetry_interleaved_1D_kernel", gridDim, blockDim, deviceProp); + + hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_1D_kernel, + gridDim, + blockDim, + 0, + 0, + input_data, + length[0], + stride[0], + dist, + batch, + length[0] % 2 == 0); + + break; + } + case 2: + { + launch_limits_check( + "impose_hermitian_symmetry_interleaved_2D_kernel", gridDim, blockDim, deviceProp); + + hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_2D_kernel, + gridDim, + blockDim, + 0, + 0, + input_data, + length[0], + length[1], + stride[0], + stride[1], + dist, + batch, + (ilength[0] + 1) / 2 - 1, + length[0] % 2 == 0, + length[1] % 2 == 0); + + break; + } + case 3: + { + launch_limits_check( + "impose_hermitian_symmetry_interleaved_3D_kernel", gridDim, blockDim, deviceProp); + + hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_3D_kernel, + gridDim, + blockDim, + 0, + 0, + input_data, + length[0], + length[1], + length[2], + stride[0], + stride[1], + stride[2], + dist, + batch, + (ilength[0] + 1) / 2 - 1, + ilength[1] - 1, + (ilength[1] + 1) / 2 - 1, + length[0] % 2 == 0, + length[1] % 2 == 0, + length[2] % 2 == 0); + break; + } + default: + throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry"); + } + auto err = hipGetLastError(); + if(err != hipSuccess) + throw 
std::runtime_error("impose_hermitian_symmetry_interleaved_kernel launch failure: " + + std::string(hipGetErrorName(err))); +} + +template +static void impose_hermitian_symmetry_planar(const std::vector& length, + const std::vector& ilength, + const std::vector& stride, + const size_t dist, + const size_t batch, + Tfloat* input_data_real, + Tfloat* input_data_imag, + const hipDeviceProp_t& deviceProp) +{ + auto blockSize = DATA_GEN_THREADS; + auto blockDim = generate_blockDim(length, blockSize); + auto gridDim = generate_hermitian_gridDim(length, batch, blockSize); + + switch(length.size()) + { + case 1: + { + launch_limits_check( + "impose_hermitian_symmetry_planar_1D_kernel", gridDim, blockDim, deviceProp); + + hipLaunchKernelGGL(impose_hermitian_symmetry_planar_1D_kernel, + gridDim, + blockDim, + 0, + 0, + input_data_real, + input_data_imag, + length[0], + stride[0], + dist, + batch, + length[0] % 2 == 0); + + break; + } + case 2: + { + launch_limits_check( + "impose_hermitian_symmetry_planar_2D_kernel", gridDim, blockDim, deviceProp); + + hipLaunchKernelGGL(impose_hermitian_symmetry_planar_2D_kernel, + gridDim, + blockDim, + 0, + 0, + input_data_real, + input_data_imag, + length[0], + length[1], + stride[0], + stride[1], + dist, + batch, + (ilength[0] + 1) / 2 - 1, + length[0] % 2 == 0, + length[1] % 2 == 0); + + break; + } + case 3: + { + launch_limits_check( + "impose_hermitian_symmetry_planar_3D_kernel", gridDim, blockDim, deviceProp); + + hipLaunchKernelGGL(impose_hermitian_symmetry_planar_3D_kernel, + gridDim, + blockDim, + 0, + 0, + input_data_real, + input_data_imag, + length[0], + length[1], + length[2], + stride[0], + stride[1], + stride[2], + dist, + batch, + (ilength[0] + 1) / 2 - 1, + ilength[1] - 1, + (ilength[1] + 1) / 2 - 1, + length[0] % 2 == 0, + length[1] % 2 == 0, + length[2] % 2 == 0); + break; + } + default: + throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry"); + } + auto err = hipGetLastError(); + if(err != hipSuccess) + throw std::runtime_error("impose_hermitian_symmetry_planar_kernel launch failure: " + + std::string(hipGetErrorName(err))); +} + +#endif // DATA_GEN_DEVICE_H diff --git a/shared/data_gen_host.h b/shared/data_gen_host.h new file mode 100644 index 0000000..29d3854 --- /dev/null +++ b/shared/data_gen_host.h @@ -0,0 +1,881 @@ +// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
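+// Host-side data generation helpers used by the clients: specialized
+// compute_index overloads for 1-, 2- and 3-D lengths/strides, OpenMP work
+// partitioning, and host-side handling of Hermitian symmetry for
+// complex-to-real input.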
+ +#ifndef DATA_GEN_HOST_H +#define DATA_GEN_HOST_H + +#include "../shared/hostbuf.h" +#include "../shared/increment.h" +#include +#include +#include +#include +#include + +// Specialized computation of index given 1-, 2-, 3- dimension length + stride +template +size_t compute_index(T1 length, T2 stride, size_t base) +{ + return (length * stride) + base; +} + +template +size_t + compute_index(const std::tuple& length, const std::tuple& stride, size_t base) +{ + static_assert(std::is_integral::value, "Integral required."); + static_assert(std::is_integral::value, "Integral required."); + return (std::get<0>(length) * std::get<0>(stride)) + (std::get<1>(length) * std::get<1>(stride)) + + base; +} + +template +size_t compute_index(const std::tuple& length, + const std::tuple& stride, + size_t base) +{ + static_assert(std::is_integral::value, "Integral required."); + static_assert(std::is_integral::value, "Integral required."); + return (std::get<0>(length) * std::get<0>(stride)) + (std::get<1>(length) * std::get<1>(stride)) + + (std::get<2>(length) * std::get<2>(stride)) + base; +} + +// count the number of total iterations for 1-, 2-, and 3-D dimensions +template +size_t count_iters(const T1& i) +{ + return i; +} + +template +size_t count_iters(const std::tuple& i) +{ + return std::get<0>(i) * std::get<1>(i); +} + +template +size_t count_iters(const std::tuple& i) +{ + return std::get<0>(i) * std::get<1>(i) * std::get<2>(i); +} + +template +T1 make_unit_stride(const T1& whole_length) +{ + return static_cast(1); +} + +template +std::tuple make_unit_stride(const std::tuple& whole_length) +{ + return std::make_tuple(static_cast(1), static_cast(std::get<0>(whole_length))); +} + +template +std::tuple make_unit_stride(const std::tuple& whole_length) +{ + return std::make_tuple(static_cast(1), + static_cast(std::get<0>(whole_length)), + static_cast(std::get<0>(whole_length)) + * static_cast(std::get<1>(whole_length))); +} + +// Work out how many partitions to break our iteration problem into +template +static size_t compute_partition_count(T1 length) +{ +#ifdef _OPENMP + // we seem to get contention from too many threads, which slows + // things down. particularly noticeable with mix_3D tests + static const size_t MAX_PARTITIONS = 8; + size_t iters = count_iters(length); + size_t hw_threads = std::min(MAX_PARTITIONS, static_cast(omp_get_num_procs())); + if(!hw_threads) + return 1; + + // don't bother threading problem sizes that are too small. pick + // an arbitrary number of iterations and ensure that each thread + // has at least that many iterations to process + static const size_t MIN_ITERS_PER_THREAD = 2048; + + // either use the whole CPU, or use ceil(iters/iters_per_thread) + return std::min(hw_threads, (iters + MIN_ITERS_PER_THREAD + 1) / MIN_ITERS_PER_THREAD); +#else + return 1; +#endif +} + +// Break a scalar length into some number of pieces, returning +// [(start0, end0), (start1, end1), ...] 
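// As an illustrative example (hypothetical values): partition_base(10, 3) below
// splits a length of 10 into [(0,3), (3,6), (6,10)]; the last pair is widened so the
// partitions always cover the whole range even when the division is uneven.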
+template +std::vector> partition_base(const T1& length, size_t num_parts) +{ + static_assert(std::is_integral::value, "Integral required."); + + // make sure we don't exceed the length + num_parts = std::min(length, num_parts); + + std::vector> ret(num_parts); + auto partition_size = length / num_parts; + T1 cur_partition = 0; + for(size_t i = 0; i < num_parts; ++i, cur_partition += partition_size) + { + ret[i].first = cur_partition; + ret[i].second = cur_partition + partition_size; + } + // last partition might not divide evenly, fix it up + ret.back().second = length; + return ret; +} + +// Returns pairs of startindex, endindex, for 1D, 2D, 3D lengths +template +std::vector> partition_rowmajor(const T1& length) +{ + return partition_base(length, compute_partition_count(length)); +} + +// Partition on the leftmost part of the tuple, for row-major indexing +template +std::vector, std::tuple>> + partition_rowmajor(const std::tuple& length) +{ + auto partitions = partition_base(std::get<0>(length), compute_partition_count(length)); + std::vector, std::tuple>> ret(partitions.size()); + for(size_t i = 0; i < partitions.size(); ++i) + { + std::get<0>(ret[i].first) = partitions[i].first; + std::get<1>(ret[i].first) = 0; + std::get<0>(ret[i].second) = partitions[i].second; + std::get<1>(ret[i].second) = std::get<1>(length); + } + return ret; +} +template +std::vector, std::tuple>> + partition_rowmajor(const std::tuple& length) +{ + auto partitions = partition_base(std::get<0>(length), compute_partition_count(length)); + std::vector, std::tuple>> ret(partitions.size()); + for(size_t i = 0; i < partitions.size(); ++i) + { + std::get<0>(ret[i].first) = partitions[i].first; + std::get<1>(ret[i].first) = 0; + std::get<2>(ret[i].first) = 0; + std::get<0>(ret[i].second) = partitions[i].second; + std::get<1>(ret[i].second) = std::get<1>(length); + std::get<2>(ret[i].second) = std::get<2>(length); + } + return ret; +} + +// For complex-to-real transforms, the input data must be Hermitian-symmetric. +// That is, u_k is the complex conjugate of u_{-k}, where k is the wavevector in Fourier +// space. For multi-dimensional data, this means that we only need to store a bit more +// than half of the complex values; the rest are redundant. However, there are still +// some restrictions: +// * the origin and Nyquist value(s) must be real-valued +// * some of the remaining values are still redundant, and you might get different results +// than you expect if the values don't agree. +// Below are some example kernels which impose Hermitian symmetry on a complex array +// of the given dimensions.
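// As a concrete 1D illustration: for a transform of even length N = 8, element u_0 (DC)
// and element u_4 (Nyquist) must have zero imaginary part, and the remaining entries
// must satisfy u_{8-k} = conj(u_k), e.g. u_7 = conj(u_1), u_6 = conj(u_2), u_5 = conj(u_3).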
+ +template +static void impose_hermitian_symmetry_interleaved_1D(std::vector& vals, + const std::vector& length, + const std::vector& istride, + const Tsize idist, + const Tsize nbatch) +{ + for(unsigned int ibatch = 0; ibatch < nbatch; ++ibatch) + { + auto data = ((std::complex*)vals[0].data()) + ibatch * idist; + + data[0].imag(0.0); + + if(length[0] % 2 == 0) + { + data[istride[0] * (length[0] / 2)].imag(0.0); + } + } +} + +template +static void impose_hermitian_symmetry_planar_1D(std::vector& vals, + const std::vector& length, + const std::vector& istride, + const Tsize idist, + const Tsize nbatch) +{ + for(unsigned int ibatch = 0; ibatch < nbatch; ++ibatch) + { + auto data_imag = ((Tfloat*)vals[1].data()) + ibatch * idist; + + data_imag[0] = 0.0; + + if(length[0] % 2 == 0) + { + data_imag[istride[0] * (length[0] / 2)] = 0.0; + } + } +} + +template +static void impose_hermitian_symmetry_interleaved_2D(std::vector& vals, + const std::vector& length, + const std::vector& istride, + const Tsize idist, + const Tsize nbatch) +{ + for(unsigned int ibatch = 0; ibatch < nbatch; ++ibatch) + { + auto data = ((std::complex*)vals[0].data()) + ibatch * idist; + + data[0].imag(0.0); + + if(length[0] % 2 == 0) + { + data[istride[0] * (length[0] / 2)].imag(0.0); + } + + if(length[1] % 2 == 0) + { + data[istride[1] * (length[1] / 2)].imag(0.0); + } + + if(length[0] % 2 == 0 && length[1] % 2 == 0) + { + data[istride[0] * (length[0] / 2) + istride[1] * (length[1] / 2)].imag(0.0); + } + + for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) + { + data[istride[0] * (length[0] - i)] = std::conj(data[istride[0] * i]); + } + + if(length[1] % 2 == 0) + { + for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) + { + data[istride[0] * (length[0] - i) + istride[1] * (length[1] / 2)] + = std::conj(data[istride[0] * i + istride[1] * (length[1] / 2)]); + } + } + } +} + +template +static void impose_hermitian_symmetry_planar_2D(std::vector& vals, + const std::vector& length, + const std::vector& istride, + const Tsize idist, + const Tsize nbatch) +{ + for(unsigned int ibatch = 0; ibatch < nbatch; ++ibatch) + { + auto data_real = ((Tfloat*)vals[0].data()) + ibatch * idist; + auto data_imag = ((Tfloat*)vals[1].data()) + ibatch * idist; + + data_imag[0] = 0.0; + + if(length[0] % 2 == 0) + { + data_imag[istride[0] * (length[0] / 2)] = 0.0; + } + + if(length[1] % 2 == 0) + { + data_imag[istride[1] * (length[1] / 2)] = 0.0; + } + + if(length[0] % 2 == 0 && length[1] % 2 == 0) + { + data_imag[istride[0] * (length[0] / 2) + istride[1] * (length[1] / 2)] = 0.0; + } + + for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) + { + data_real[istride[0] * (length[0] - i)] = data_real[istride[0] * i]; + data_imag[istride[0] * (length[0] - i)] = -data_imag[istride[0] * i]; + } + + if(length[1] % 2 == 0) + { + for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) + { + data_real[istride[0] * (length[0] - i) + istride[1] * (length[1] / 2)] + = data_real[istride[0] * i + istride[1] * (length[1] / 2)]; + data_imag[istride[0] * (length[0] - i) + istride[1] * (length[1] / 2)] + = -data_imag[istride[0] * i + istride[1] * (length[1] / 2)]; + } + } + } +} + +template +static void impose_hermitian_symmetry_interleaved_3D(std::vector& vals, + const std::vector& length, + const std::vector& istride, + const Tsize idist, + const Tsize nbatch) +{ + for(unsigned int ibatch = 0; ibatch < nbatch; ++ibatch) + { + auto data = ((std::complex*)vals[0].data()) + ibatch * idist; + + data[0].imag(0.0); + + if(length[0] % 2 == 0) + { + data[istride[0] * 
(length[0] / 2)].imag(0.0); + } + + if(length[1] % 2 == 0) + { + data[istride[1] * (length[1] / 2)].imag(0.0); + } + + if(length[2] % 2 == 0) + { + data[istride[2] * (length[2] / 2)].imag(0.0); + } + + if(length[0] % 2 == 0 && length[1] % 2 == 0) + { + data[istride[0] * (length[0] / 2) + istride[1] * (length[1] / 2)].imag(0.0); + } + + if(length[0] % 2 == 0 && length[2] % 2 == 0) + { + data[istride[0] * (length[0] / 2) + istride[2] * (length[2] / 2)].imag(0.0); + } + if(length[1] % 2 == 0 && length[2] % 2 == 0) + { + data[istride[1] * (length[1] / 2) + istride[2] * (length[2] / 2)].imag(0.0); + } + + if(length[0] % 2 == 0 && length[1] % 2 == 0 && length[2] % 2 == 0) + { + data[istride[0] * (length[0] / 2) + istride[1] * (length[1] / 2) + + istride[2] * (length[2] / 2)] + .imag(0.0); + } + + // y-axis: + for(unsigned int j = 1; j < (length[1] + 1) / 2; ++j) + { + data[istride[1] * (length[1] - j)] = std::conj(data[istride[1] * j]); + } + + if(length[0] % 2 == 0) + { + // y-axis at x-nyquist + for(unsigned int j = 1; j < (length[1] + 1) / 2; ++j) + { + data[istride[0] * (length[0] / 2) + istride[1] * (length[1] - j)] + = std::conj(data[istride[0] * (length[0] / 2) + istride[1] * j]); + } + } + + // x-axis: + for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) + { + data[istride[0] * (length[0] - i)] = std::conj(data[istride[0] * i]); + } + + if(length[1] % 2 == 0) + { + // x-axis at y-nyquist + for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) + { + data[istride[0] * (length[0] - i) + istride[1] * (length[1] / 2)] + = std::conj(data[istride[0] * i + istride[1] * (length[1] / 2)]); + } + } + + // x-y plane: + for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) + { + for(unsigned int j = 1; j < length[1]; ++j) + { + data[istride[0] * (length[0] - i) + istride[1] * (length[1] - j)] + = std::conj(data[istride[0] * i + istride[1] * j]); + } + } + + if(length[2] % 2 == 0) + { + // x-axis at z-nyquist + for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) + { + data[istride[0] * (length[0] - i) + istride[2] * (length[2] / 2)] + = std::conj(data[istride[0] * i + istride[2] * (length[2] / 2)]); + } + if(length[1] % 2 == 0) + { + // x-axis at yz-nyquist + for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) + { + data[istride[0] * (length[0] - i) + istride[2] * (length[2] / 2)] + = std::conj(data[istride[0] * i + istride[2] * (length[2] / 2)]); + } + } + + // y-axis: at z-nyquist + for(unsigned int j = 1; j < (length[1] + 1) / 2; ++j) + { + data[istride[1] * (length[1] - j) + istride[2] * (length[2] / 2)] + = std::conj(data[istride[1] * j + istride[2] * (length[2] / 2)]); + } + + if(length[0] % 2 == 0) + { + // y-axis: at xz-nyquist + for(unsigned int j = 1; j < (length[1] + 1) / 2; ++j) + { + data[istride[0] * (length[0] / 2) + istride[1] * (length[1] - j) + + istride[2] * (length[2] / 2)] + = std::conj(data[istride[0] * (length[0] / 2) + istride[1] * j + + istride[2] * (length[2] / 2)]); + } + } + + // x-y plane: at z-nyquist + for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) + { + for(unsigned int j = 1; j < length[1]; ++j) + { + data[istride[0] * (length[0] - i) + istride[1] * (length[1] - j) + + istride[2] * (length[2] / 2)] + = std::conj( + data[istride[0] * i + istride[1] * j + istride[2] * (length[2] / 2)]); + } + } + } + } +} + +template +static void impose_hermitian_symmetry_planar_3D(std::vector& vals, + const std::vector& length, + const std::vector& istride, + const Tsize idist, + const Tsize nbatch) +{ + for(unsigned int ibatch = 0; ibatch < nbatch; ++ibatch) + { + auto 
data_real = ((Tfloat*)vals[0].data()) + ibatch * idist; + auto data_imag = ((Tfloat*)vals[1].data()) + ibatch * idist; + + data_imag[0] = 0.0; + + if(length[0] % 2 == 0) + { + data_imag[istride[0] * (length[0] / 2)] = 0.0; + } + + if(length[1] % 2 == 0) + { + data_imag[istride[1] * (length[1] / 2)] = 0.0; + } + + if(length[2] % 2 == 0) + { + data_imag[istride[2] * (length[2] / 2)] = 0.0; + } + + if(length[0] % 2 == 0 && length[1] % 2 == 0) + { + data_imag[istride[0] * (length[0] / 2) + istride[1] * (length[1] / 2)] = 0.0; + } + + if(length[0] % 2 == 0 && length[2] % 2 == 0) + { + data_imag[istride[0] * (length[0] / 2) + istride[2] * (length[2] / 2)] = 0.0; + } + if(length[1] % 2 == 0 && length[2] % 2 == 0) + { + data_imag[istride[1] * (length[1] / 2) + istride[2] * (length[2] / 2)] = 0.0; + } + + if(length[0] % 2 == 0 && length[1] % 2 == 0 && length[2] % 2 == 0) + { + data_imag[istride[0] * (length[0] / 2) + istride[1] * (length[1] / 2) + + istride[2] * (length[2] / 2)] + = 0.0; + } + + // y-axis: + for(unsigned int j = 1; j < (length[1] + 1) / 2; ++j) + { + data_real[istride[1] * (length[1] - j)] = data_real[istride[1] * j]; + data_imag[istride[1] * (length[1] - j)] = -data_imag[istride[1] * j]; + } + + if(length[0] % 2 == 0) + { + // y-axis at x-nyquist + for(unsigned int j = 1; j < (length[1] + 1) / 2; ++j) + { + data_real[istride[0] * (length[0] / 2) + istride[1] * (length[1] - j)] + = data_real[istride[0] * (length[0] / 2) + istride[1] * j]; + data_imag[istride[0] * (length[0] / 2) + istride[1] * (length[1] - j)] + = -data_imag[istride[0] * (length[0] / 2) + istride[1] * j]; + } + } + + // x-axis: + for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) + { + data_real[istride[0] * (length[0] - i)] = data_real[istride[0] * i]; + data_imag[istride[0] * (length[0] - i)] = -data_imag[istride[0] * i]; + } + + if(length[1] % 2 == 0) + { + // x-axis at y-nyquist + for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) + { + data_real[istride[0] * (length[0] - i) + istride[1] * (length[1] / 2)] + = data_real[istride[0] * i + istride[1] * (length[1] / 2)]; + data_imag[istride[0] * (length[0] - i) + istride[1] * (length[1] / 2)] + = -data_imag[istride[0] * i + istride[1] * (length[1] / 2)]; + } + } + + // x-y plane: + for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) + { + for(unsigned int j = 1; j < length[1]; ++j) + { + data_real[istride[0] * (length[0] - i) + istride[1] * (length[1] - j)] + = data_real[istride[0] * i + istride[1] * j]; + data_imag[istride[0] * (length[0] - i) + istride[1] * (length[1] - j)] + = -data_imag[istride[0] * i + istride[1] * j]; + } + } + + if(length[2] % 2 == 0) + { + // x-axis at z-nyquist + for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) + { + data_real[istride[0] * (length[0] - i) + istride[2] * (length[2] / 2)] + = data_real[istride[0] * i + istride[2] * (length[2] / 2)]; + data_imag[istride[0] * (length[0] - i) + istride[2] * (length[2] / 2)] + = -data_imag[istride[0] * i + istride[2] * (length[2] / 2)]; + } + if(length[1] % 2 == 0) + { + // x-axis at yz-nyquist + for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) + { + data_real[istride[0] * (length[0] - i) + istride[2] * (length[2] / 2)] + = data_real[istride[0] * i + istride[2] * (length[2] / 2)]; + data_imag[istride[0] * (length[0] - i) + istride[2] * (length[2] / 2)] + = -data_imag[istride[0] * i + istride[2] * (length[2] / 2)]; + } + } + + // y-axis: at z-nyquist + for(unsigned int j = 1; j < (length[1] + 1) / 2; ++j) + { + data_real[istride[1] * (length[1] - j) + istride[2] * (length[2] / 
2)] + = data_real[istride[1] * j + istride[2] * (length[2] / 2)]; + data_imag[istride[1] * (length[1] - j) + istride[2] * (length[2] / 2)] + = -data_imag[istride[1] * j + istride[2] * (length[2] / 2)]; + } + + if(length[0] % 2 == 0) + { + // y-axis: at xz-nyquist + for(unsigned int j = 1; j < (length[1] + 1) / 2; ++j) + { + data_real[istride[0] * (length[0] / 2) + istride[1] * (length[1] - j) + + istride[2] * (length[2] / 2)] + = data_real[istride[0] * (length[0] / 2) + istride[1] * j + + istride[2] * (length[2] / 2)]; + data_imag[istride[0] * (length[0] / 2) + istride[1] * (length[1] - j) + + istride[2] * (length[2] / 2)] + = -data_imag[istride[0] * (length[0] / 2) + istride[1] * j + + istride[2] * (length[2] / 2)]; + } + } + + // x-y plane: at z-nyquist + for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) + { + for(unsigned int j = 1; j < length[1]; ++j) + { + data_real[istride[0] * (length[0] - i) + istride[1] * (length[1] - j) + + istride[2] * (length[2] / 2)] + = data_real[istride[0] * i + istride[1] * j + istride[2] * (length[2] / 2)]; + data_imag[istride[0] * (length[0] - i) + istride[1] * (length[1] - j) + + istride[2] * (length[2] / 2)] + = -data_imag[istride[0] * i + istride[1] * j + + istride[2] * (length[2] / 2)]; + } + } + } + } +} + +template +static void generate_random_interleaved_data(std::vector& input, + const Tint1& whole_length, + const Tint1& whole_stride, + const size_t idist, + const size_t nbatch) +{ + auto idata = (std::complex*)input[0].data(); + size_t i_base = 0; + auto partitions = partition_rowmajor(whole_length); + for(unsigned int b = 0; b < nbatch; b++, i_base += idist) + { +#pragma omp parallel for num_threads(partitions.size()) + for(size_t part = 0; part < partitions.size(); ++part) + { + auto index = partitions[part].first; + const auto length = partitions[part].second; + std::mt19937 gen(compute_index(index, whole_stride, i_base)); + do + { + const auto i = compute_index(index, whole_stride, i_base); + const Tfloat x = (Tfloat)gen() / (Tfloat)gen.max(); + const Tfloat y = (Tfloat)gen() / (Tfloat)gen.max(); + const std::complex val(x, y); + idata[i] = val; + } while(increment_rowmajor(index, length)); + } + } +} + +template +static void generate_interleaved_data(std::vector& input, + const Tint1& whole_length, + const Tint1& whole_stride, + const size_t idist, + const size_t nbatch) +{ + auto idata = (std::complex*)input[0].data(); + size_t i_base = 0; + auto partitions = partition_rowmajor(whole_length); + auto unit_stride = make_unit_stride(whole_length); + + const Tfloat inv_scale = 1.0 / static_cast(count_iters(whole_length) - 1); + + for(unsigned int b = 0; b < nbatch; b++, i_base += idist) + { +#pragma omp parallel for num_threads(partitions.size()) + for(size_t part = 0; part < partitions.size(); ++part) + { + auto index = partitions[part].first; + const auto length = partitions[part].second; + do + { + const auto val_xy + = -0.5 + static_cast(compute_index(index, unit_stride, 0)) * inv_scale; + + const std::complex val(val_xy, val_xy); + + const auto i = compute_index(index, whole_stride, i_base); + + idata[i] = val; + } while(increment_rowmajor(index, length)); + } + } +} + +template +static void generate_random_planar_data(std::vector& input, + const Tint1& whole_length, + const Tint1& whole_stride, + const size_t idist, + const size_t nbatch) +{ + auto ireal = (Tfloat*)input[0].data(); + auto iimag = (Tfloat*)input[1].data(); + size_t i_base = 0; + auto partitions = partition_rowmajor(whole_length); + for(unsigned int b = 0; b < nbatch; 
b++, i_base += idist) + { +#pragma omp parallel for num_threads(partitions.size()) + for(size_t part = 0; part < partitions.size(); ++part) + { + auto index = partitions[part].first; + const auto length = partitions[part].second; + std::mt19937 gen(compute_index(index, whole_stride, i_base)); + do + { + const auto i = compute_index(index, whole_stride, i_base); + const std::complex val((Tfloat)gen() / (Tfloat)gen.max(), + (Tfloat)gen() / (Tfloat)gen.max()); + ireal[i] = val.real(); + iimag[i] = val.imag(); + } while(increment_rowmajor(index, length)); + } + } +} + +template +static void generate_planar_data(std::vector& input, + const Tint1& whole_length, + const Tint1& whole_stride, + const size_t idist, + const size_t nbatch) +{ + + auto ireal = (Tfloat*)input[0].data(); + auto iimag = (Tfloat*)input[1].data(); + size_t i_base = 0; + auto partitions = partition_rowmajor(whole_length); + auto unit_stride = make_unit_stride(whole_length); + + const Tfloat inv_scale = 1.0 / static_cast(count_iters(whole_length) - 1); + + for(unsigned int b = 0; b < nbatch; b++, i_base += idist) + { +#pragma omp parallel for num_threads(partitions.size()) + for(size_t part = 0; part < partitions.size(); ++part) + { + auto index = partitions[part].first; + const auto length = partitions[part].second; + do + { + const auto val_xy + = -0.5 + static_cast(compute_index(index, unit_stride, 0)) * inv_scale; + + const auto i = compute_index(index, whole_stride, i_base); + + ireal[i] = val_xy; + iimag[i] = val_xy; + } while(increment_rowmajor(index, length)); + } + } +} + +template +static void generate_random_real_data(std::vector& input, + const Tint1& whole_length, + const Tint1& whole_stride, + const size_t idist, + const size_t nbatch) +{ + auto idata = (Tfloat*)input[0].data(); + size_t i_base = 0; + auto partitions = partition_rowmajor(whole_length); + for(unsigned int b = 0; b < nbatch; b++, i_base += idist) + { +#pragma omp parallel for num_threads(partitions.size()) + for(size_t part = 0; part < partitions.size(); ++part) + { + auto index = partitions[part].first; + const auto length = partitions[part].second; + std::mt19937 gen(compute_index(index, whole_stride, i_base)); + do + { + const auto i = compute_index(index, whole_stride, i_base); + const Tfloat val = (Tfloat)gen() / (Tfloat)gen.max(); + idata[i] = val; + } while(increment_rowmajor(index, length)); + } + } +} + +template +static void generate_real_data(std::vector& input, + const Tint1& whole_length, + const Tint1& whole_stride, + const size_t idist, + const size_t nbatch) +{ + + auto idata = (Tfloat*)input[0].data(); + size_t i_base = 0; + auto partitions = partition_rowmajor(whole_length); + auto unit_stride = make_unit_stride(whole_length); + + const Tfloat inv_scale = 1.0 / static_cast(count_iters(whole_length) - 1); + + for(unsigned int b = 0; b < nbatch; b++, i_base += idist) + { +#pragma omp parallel for num_threads(partitions.size()) + for(size_t part = 0; part < partitions.size(); ++part) + { + auto index = partitions[part].first; + const auto length = partitions[part].second; + do + { + const auto i = compute_index(index, whole_stride, i_base); + + idata[i] + = -0.5 + static_cast(compute_index(index, unit_stride, 0)) * inv_scale; + } while(increment_rowmajor(index, length)); + } + } +} + +template +static void impose_hermitian_symmetry_interleaved(std::vector& vals, + const std::vector& length, + const std::vector& istride, + const Tsize idist, + const Tsize nbatch) +{ + switch(length.size()) + { + case 1: + 
impose_hermitian_symmetry_interleaved_1D(vals, length, istride, idist, nbatch); + break; + case 2: + impose_hermitian_symmetry_interleaved_2D(vals, length, istride, idist, nbatch); + break; + case 3: + impose_hermitian_symmetry_interleaved_3D(vals, length, istride, idist, nbatch); + break; + default: + throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry"); + } +} + +template +static void impose_hermitian_symmetry_planar(std::vector& vals, + const std::vector& length, + const std::vector& istride, + const Tsize idist, + const Tsize nbatch) +{ + switch(length.size()) + { + case 1: + impose_hermitian_symmetry_planar_1D(vals, length, istride, idist, nbatch); + break; + case 2: + impose_hermitian_symmetry_planar_2D(vals, length, istride, idist, nbatch); + break; + case 3: + impose_hermitian_symmetry_planar_3D(vals, length, istride, idist, nbatch); + break; + default: + throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry"); + } +} + +#endif // DATA_GEN_HOST_H diff --git a/shared/device_properties.h b/shared/device_properties.h new file mode 100644 index 0000000..6e2e1e1 --- /dev/null +++ b/shared/device_properties.h @@ -0,0 +1,74 @@ +// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#ifndef ROCFFT_DEVICE_PROPS_H +#define ROCFFT_DEVICE_PROPS_H + +#include +#include +#include + +// get device properties +static hipDeviceProp_t get_curr_device_prop() +{ + hipDeviceProp_t prop; + int deviceId = 0; + if(hipGetDevice(&deviceId) != hipSuccess) + throw std::runtime_error("hipGetDevice failed."); + + if(hipGetDeviceProperties(&prop, deviceId) != hipSuccess) + throw std::runtime_error("hipGetDeviceProperties failed for deviceId " + + std::to_string(deviceId)); + + return prop; +} + +// check that the given grid/block dims will fit into the limits in +// the device properties. throws std::runtime_error if the limits +// are exceeded. +static void launch_limits_check(const std::string& kernel_name, + const dim3 gridDim, + const dim3 blockDim, + const hipDeviceProp_t& deviceProp) +{ + // Need lots of casting here because dim3 is unsigned but device + // props are signed. Cast direct comparisons to fix signedness + // issues. Promote types to 64-bit when multiplying to try to + // avoid overflow. 
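// As an arithmetic illustration of the overflow concern (numbers chosen only to show
// the mechanism): two 32-bit factors of 65536 multiply to 2^32, which wraps to 0 in
// unsigned 32-bit arithmetic; promoting the first factor to size_t keeps the product exact.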
+ + // Block limits along each dimension + if(blockDim.x > static_cast(deviceProp.maxThreadsDim[0]) + || blockDim.y > static_cast(deviceProp.maxThreadsDim[1]) + || blockDim.z > static_cast(deviceProp.maxThreadsDim[2])) + throw std::runtime_error("max threads per dim exceeded: " + kernel_name); + + // Total threads for the whole block + if(static_cast(blockDim.x) * blockDim.y * blockDim.z + > static_cast(deviceProp.maxThreadsPerBlock)) + throw std::runtime_error("max threads per block exceeded: " + kernel_name); + + // Grid dimension limits + if(gridDim.x > static_cast(deviceProp.maxGridSize[0]) + || gridDim.y > static_cast(deviceProp.maxGridSize[1]) + || gridDim.z > static_cast(deviceProp.maxGridSize[2])) + throw std::runtime_error("max grid size exceeded: " + kernel_name); +} + +#endif diff --git a/shared/enum_to_string.h b/shared/enum_to_string.h new file mode 100644 index 0000000..1c2fba0 --- /dev/null +++ b/shared/enum_to_string.h @@ -0,0 +1,81 @@ +// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#ifndef ENUM_TO_STRING_H +#define ENUM_TO_STRING_H + +#include "fft_params.h" + +// Return the string of the hipError code. 
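// Illustrative use at a hypothetical call site:
//   hipError_t err = hipDeviceSynchronize();
//   if(err != hipSuccess)
//       throw std::runtime_error("sync failed: " + hipError_to_string(err));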
+static std::string hipError_to_string(const hipError_t ret) +{ + switch(ret) + { + case hipSuccess: + return "hipSuccess"; + case hipErrorInvalidContext: + return "hipErrorInvalidContext"; + case hipErrorInvalidKernelFile: + return "hipErrorInvalidKernelFile"; + case hipErrorMemoryAllocation: + return "hipErrorMemoryAllocation"; + case hipErrorInitializationError: + return "hipErrorInitializationError"; + case hipErrorLaunchFailure: + return "hipErrorLaunchFailure"; + case hipErrorLaunchOutOfResources: + return "hipErrorLaunchOutOfResources"; + case hipErrorInvalidDevice: + return "hipErrorInvalidDevice"; + case hipErrorInvalidValue: + return "hipErrorInvalidValue"; + case hipErrorInvalidDevicePointer: + return "hipErrorInvalidDevicePointer"; + case hipErrorInvalidMemcpyDirection: + return "hipErrorInvalidMemcpyDirection"; + case hipErrorUnknown: + return "hipErrorUnknown"; + case hipErrorInvalidResourceHandle: + return "hipErrorInvalidResourceHandle"; + case hipErrorNotReady: + return "hipErrorNotReady"; + case hipErrorNoDevice: + return "hipErrorNoDevice"; + case hipErrorPeerAccessAlreadyEnabled: + return "hipErrorPeerAccessAlreadyEnabled"; + case hipErrorPeerAccessNotEnabled: + return "hipErrorPeerAccessNotEnabled"; + case hipErrorRuntimeMemory: + return "hipErrorRuntimeMemory"; + case hipErrorRuntimeOther: + return "hipErrorRuntimeOther"; + case hipErrorHostMemoryAlreadyRegistered: + return "hipErrorHostMemoryAlreadyRegistered"; + case hipErrorHostMemoryNotRegistered: + return "hipErrorHostMemoryNotRegistered"; + case hipErrorMapBufferObjectFailed: + return "hipErrorMapBufferObjectFailed"; + case hipErrorTbd: + return "hipErrorTbd"; + default: + throw std::runtime_error("unknown hipError"); + } +} +#endif diff --git a/shared/environment.h b/shared/environment.h new file mode 100644 index 0000000..7be56a0 --- /dev/null +++ b/shared/environment.h @@ -0,0 +1,97 @@ +// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +// wrappers around environment variable routines + +#pragma once + +#include + +// Windows provides "getenv" and "_putenv", but those modify the +// runtime's copy of the environment. The actual environment in the +// process control block is accessed using GetEnvironmentVariable and +// SetEnvironmentVariable. 
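// The rocfft_setenv/rocfft_getenv/rocfft_unsetenv wrappers defined below hide that
// platform difference. Illustrative use (the variable name is hypothetical):
//   rocfft_setenv("SOME_TEST_VAR", "1");
//   std::string v = rocfft_getenv("SOME_TEST_VAR");   // v == "1"
//   rocfft_unsetenv("SOME_TEST_VAR");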
+ +#ifdef WIN32 +#include +static void rocfft_setenv(const char* var, const char* value) +{ + SetEnvironmentVariable(var, value); +} +static void rocfft_unsetenv(const char* var) +{ + SetEnvironmentVariable(var, nullptr); +} +static std::string rocfft_getenv(const char* var) +{ + DWORD size = GetEnvironmentVariable(var, nullptr, 0); + std::string ret; + if(size) + { + ret.resize(size); + GetEnvironmentVariable(var, ret.data(), size); + // GetEnvironmentVariable counts the terminating null, so remove it + while(!ret.empty() && ret.back() == 0) + ret.pop_back(); + } + return ret; +} + +#else + +#include + +static void rocfft_setenv(const char* var, const char* value) +{ + setenv(var, value, 1); +} +static void rocfft_unsetenv(const char* var) +{ + unsetenv(var); +} +static std::string rocfft_getenv(const char* var) +{ + auto value = getenv(var); + return value ? value : ""; +} +#endif + +// RAII object to set an environment variable and restore it to its +// previous value on destruction +struct EnvironmentSetTemp +{ + EnvironmentSetTemp(const char* _var, const char* val) + : var(_var) + { + auto val_ptr = rocfft_getenv(_var); + if(!val_ptr.empty()) + oldvalue = val_ptr; + rocfft_setenv(_var, val); + } + ~EnvironmentSetTemp() + { + if(oldvalue.empty()) + rocfft_unsetenv(var.c_str()); + else + rocfft_setenv(var.c_str(), oldvalue.c_str()); + } + std::string var; + std::string oldvalue; +}; diff --git a/shared/fft_params.h b/shared/fft_params.h new file mode 100644 index 0000000..bf428ef --- /dev/null +++ b/shared/fft_params.h @@ -0,0 +1,3274 @@ +// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +#ifndef FFT_PARAMS_H +#define FFT_PARAMS_H + +#include +#include +#include +#include +#include +#include +#ifdef _OPENMP +#include +#endif +#include +#include +#include +#include + +#include "../shared/arithmetic.h" +#include "../shared/array_validator.h" +#include "../shared/data_gen_device.h" +#include "../shared/data_gen_host.h" +#include "../shared/device_properties.h" +#include "../shared/printbuffer.h" +#include "../shared/ptrdiff.h" + +enum fft_status +{ + fft_status_success, + fft_status_failure, + fft_status_invalid_arg_value, + fft_status_invalid_dimensions, + fft_status_invalid_array_type, + fft_status_invalid_strides, + fft_status_invalid_distance, + fft_status_invalid_offset, + fft_status_invalid_work_buffer, +}; + +enum fft_transform_type +{ + fft_transform_type_complex_forward, + fft_transform_type_complex_inverse, + fft_transform_type_real_forward, + fft_transform_type_real_inverse, +}; + +enum fft_precision +{ + fft_precision_half, + fft_precision_single, + fft_precision_double, +}; + +static std::istream& operator>>(std::istream& str, fft_precision& precision) +{ + std::string word; + str >> word; + + if(word == "half") + precision = fft_precision_half; + else if(word == "single") + precision = fft_precision_single; + else if(word == "double") + precision = fft_precision_double; + else + throw std::runtime_error("Invalid precision specified"); + return str; +} + +// fft_input_generator: linearly spaced sequence in [-0.5,0.5] +// fft_input_random_generator: pseudo-random sequence in [-0.5,0.5] +enum fft_input_generator +{ + fft_input_random_generator_device, + fft_input_random_generator_host, + fft_input_generator_device, + fft_input_generator_host, +}; + +static std::istream& operator>>(std::istream& str, fft_input_generator& gen) +{ + std::string word; + str >> word; + + if(word == "0") + gen = fft_input_random_generator_device; + else if(word == "1") + gen = fft_input_random_generator_host; + else if(word == "2") + gen = fft_input_generator_device; + else if(word == "3") + gen = fft_input_generator_host; + else + throw std::runtime_error("Invalid input generator specified"); + return str; +} + +enum fft_array_type +{ + fft_array_type_complex_interleaved, + fft_array_type_complex_planar, + fft_array_type_real, + fft_array_type_hermitian_interleaved, + fft_array_type_hermitian_planar, + fft_array_type_unset, +}; + +enum fft_result_placement +{ + fft_placement_inplace, + fft_placement_notinplace, +}; + +// Determine the size of the data type given the precision and type. +template +inline Tsize var_size(const fft_precision precision, const fft_array_type type) +{ + size_t var_size = 0; + switch(precision) + { + case fft_precision_half: + var_size = sizeof(_Float16); + break; + case fft_precision_single: + var_size = sizeof(float); + break; + case fft_precision_double: + var_size = sizeof(double); + break; + } + switch(type) + { + case fft_array_type_complex_interleaved: + case fft_array_type_hermitian_interleaved: + var_size *= 2; + break; + default: + break; + } + return var_size; +} +// Given an array type and transform length, strides, etc, load random floats in [0,1] +// into the input array of floats/doubles or complex floats/doubles gpu buffers. 
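// A minimal sketch of how the dimension-unrolling overload below might be driven; the
// buffer type, explicit template arguments, and surrounding setup are assumptions for
// illustration rather than part of this header:
//   std::vector<gpubuf> input(1);   // one interleaved device buffer, already allocated
//   set_input<gpubuf, float>(input, fft_input_random_generator_device,
//                            fft_array_type_complex_interleaved,
//                            length, ilength, istride, idist, nbatch, deviceProp);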
+template +inline void set_input(std::vector& input, + const fft_input_generator igen, + const fft_array_type itype, + const std::vector& length, + const std::vector& ilength, + const std::vector& istride, + const Tint1& whole_length, + const Tint1& whole_stride, + const size_t idist, + const size_t nbatch, + const hipDeviceProp_t& deviceProp) +{ + auto isize = count_iters(whole_length) * nbatch; + + switch(itype) + { + case fft_array_type_complex_interleaved: + case fft_array_type_hermitian_interleaved: + { + auto ibuffer = (rocfft_complex*)input[0].data(); + + if(igen == fft_input_generator_device) + generate_interleaved_data( + whole_length, idist, isize, whole_stride, nbatch, ibuffer, deviceProp); + else if(igen == fft_input_random_generator_device) + generate_random_interleaved_data( + whole_length, idist, isize, whole_stride, ibuffer, deviceProp); + + if(itype == fft_array_type_hermitian_interleaved) + { + auto ibuffer_2 = (rocfft_complex*)input[0].data(); + impose_hermitian_symmetry_interleaved( + length, ilength, istride, idist, nbatch, ibuffer_2, deviceProp); + } + + break; + } + case fft_array_type_complex_planar: + case fft_array_type_hermitian_planar: + { + auto ibuffer_real = (Tfloat*)input[0].data(); + auto ibuffer_imag = (Tfloat*)input[1].data(); + + if(igen == fft_input_generator_device) + generate_planar_data(whole_length, + idist, + isize, + whole_stride, + nbatch, + ibuffer_real, + ibuffer_imag, + deviceProp); + else if(igen == fft_input_random_generator_device) + generate_random_planar_data( + whole_length, idist, isize, whole_stride, ibuffer_real, ibuffer_imag, deviceProp); + + if(itype == fft_array_type_hermitian_planar) + impose_hermitian_symmetry_planar( + length, ilength, istride, idist, nbatch, ibuffer_real, ibuffer_imag, deviceProp); + + break; + } + case fft_array_type_real: + { + auto ibuffer = (Tfloat*)input[0].data(); + + if(igen == fft_input_generator_device) + generate_real_data( + whole_length, idist, isize, whole_stride, nbatch, ibuffer, deviceProp); + else if(igen == fft_input_random_generator_device) + generate_random_real_data( + whole_length, idist, isize, whole_stride, ibuffer, deviceProp); + + break; + } + default: + throw std::runtime_error("Input layout format not yet supported"); + } +} + +template +inline void set_input(std::vector& input, + const fft_input_generator igen, + const fft_array_type itype, + const std::vector& length, + const std::vector& ilength, + const std::vector& istride, + const Tint1& whole_length, + const Tint1& whole_stride, + const size_t idist, + const size_t nbatch, + const hipDeviceProp_t& deviceProp) +{ + switch(itype) + { + case fft_array_type_complex_interleaved: + case fft_array_type_hermitian_interleaved: + { + if(igen == fft_input_generator_host) + generate_interleaved_data(input, whole_length, whole_stride, idist, nbatch); + else if(igen == fft_input_random_generator_host) + generate_random_interleaved_data( + input, whole_length, whole_stride, idist, nbatch); + + if(itype == fft_array_type_hermitian_interleaved) + impose_hermitian_symmetry_interleaved(input, length, istride, idist, nbatch); + + break; + } + case fft_array_type_complex_planar: + case fft_array_type_hermitian_planar: + { + if(igen == fft_input_generator_host) + generate_planar_data(input, whole_length, whole_stride, idist, nbatch); + else if(igen == fft_input_random_generator_host) + generate_random_planar_data(input, whole_length, whole_stride, idist, nbatch); + + if(itype == fft_array_type_hermitian_planar) + 
impose_hermitian_symmetry_planar(input, length, istride, idist, nbatch); + + break; + } + case fft_array_type_real: + { + if(igen == fft_input_generator_host) + generate_real_data(input, whole_length, whole_stride, idist, nbatch); + else if(igen == fft_input_random_generator_host) + generate_random_real_data(input, whole_length, whole_stride, idist, nbatch); + + break; + } + default: + throw std::runtime_error("Input layout format not yet supported"); + } +} + +// unroll set_input for dimension 1, 2, 3 +template +inline void set_input(std::vector& input, + const fft_input_generator igen, + const fft_array_type itype, + const std::vector& length, + const std::vector& ilength, + const std::vector& istride, + const size_t idist, + const size_t nbatch, + const hipDeviceProp_t& deviceProp) +{ + switch(length.size()) + { + case 1: + set_input(input, + igen, + itype, + length, + ilength, + istride, + ilength[0], + istride[0], + idist, + nbatch, + deviceProp); + break; + case 2: + set_input(input, + igen, + itype, + length, + ilength, + istride, + std::make_tuple(ilength[0], ilength[1]), + std::make_tuple(istride[0], istride[1]), + idist, + nbatch, + deviceProp); + break; + case 3: + set_input(input, + igen, + itype, + length, + ilength, + istride, + std::make_tuple(ilength[0], ilength[1], ilength[2]), + std::make_tuple(istride[0], istride[1], istride[2]), + idist, + nbatch, + deviceProp); + break; + default: + abort(); + } +} + +// Container class for test parameters. +class fft_params +{ +public: + // All parameters are row-major. + std::vector length; + std::vector istride; + std::vector ostride; + size_t nbatch = 1; + fft_precision precision = fft_precision_single; + fft_input_generator igen = fft_input_random_generator_device; + fft_transform_type transform_type = fft_transform_type_complex_forward; + fft_result_placement placement = fft_placement_inplace; + size_t idist = 0; + size_t odist = 0; + fft_array_type itype = fft_array_type_unset; + fft_array_type otype = fft_array_type_unset; + std::vector ioffset = {0, 0}; + std::vector ooffset = {0, 0}; + + std::vector isize; + std::vector osize; + + size_t workbuffersize = 0; + + struct fft_brick + { + // all vectors here are row-major, with same length as FFT + // dimension + 1 (for batch dimension) + + // inclusive lower bound of brick + std::vector lower; + // exclusive upper bound of brick + std::vector upper; + // stride of brick in memory + std::vector stride; + + // compute the length of this brick + std::vector length() const + { + std::vector ret; + for(size_t i = 0; i < lower.size(); ++i) + ret.push_back(upper[i] - lower[i]); + return ret; + } + + // compute offset of lower bound in a field with the given + // stride + dist (batch stride is separate) + size_t lower_field_offset(std::vector stride, size_t dist) const + { + // brick strides include batch, so adjust our input accordingly + stride.insert(stride.begin(), dist); + + return std::inner_product(lower.begin(), lower.end(), stride.begin(), 0); + } + + // location of the brick + int device = 0; + }; + + struct fft_field + { + std::vector bricks; + }; + // optional brick decomposition of inputs/outputs + std::vector ifields; + std::vector ofields; + + // run testing load/store callbacks + bool run_callbacks = false; + static constexpr double load_cb_scalar = 0.457813941; + static constexpr double store_cb_scalar = 0.391504938; + + // Check that data outside of output strides is not overwritten. 
+ // This is only set explicitly on some tests where there's space + // between dimensions, but the dimensions are still in-order. + // We're not trying to generically find holes in arbitrary data + // layouts. + // + // NOTE: this flag is not included in tokens, since it doesn't + // affect how the FFT library behaves. + bool check_output_strides = false; + + // scaling factor - we do a pointwise multiplication of outputs by + // this factor + double scale_factor = 1.0; + + fft_params(){}; + virtual ~fft_params(){}; + + // Given an array type, return the name as a string. + static std::string array_type_name(const fft_array_type type, bool verbose = true) + { + switch(type) + { + case fft_array_type_complex_interleaved: + return verbose ? "fft_array_type_complex_interleaved" : "CI"; + case fft_array_type_complex_planar: + return verbose ? "fft_array_type_complex_planar" : "CP"; + case fft_array_type_real: + return verbose ? "fft_array_type_real" : "R"; + case fft_array_type_hermitian_interleaved: + return verbose ? "fft_array_type_hermitian_interleaved" : "HI"; + case fft_array_type_hermitian_planar: + return verbose ? "fft_array_type_hermitian_planar" : "HP"; + case fft_array_type_unset: + return verbose ? "fft_array_type_unset" : "UN"; + } + return ""; + } + + std::string transform_type_name() const + { + switch(transform_type) + { + case fft_transform_type_complex_forward: + return "fft_transform_type_complex_forward"; + case fft_transform_type_complex_inverse: + return "fft_transform_type_complex_inverse"; + case fft_transform_type_real_forward: + return "fft_transform_type_real_forward"; + case fft_transform_type_real_inverse: + return "fft_transform_type_real_inverse"; + default: + throw std::runtime_error("Invalid transform type"); + } + } + + // Convert to string for output. + std::string str(const std::string& separator = ", ") const + { + // top-level stride/dist are not used when fields are specified. 
+ const bool have_ifields = !ifields.empty(); + const bool have_ofields = !ofields.empty(); + + std::stringstream ss; + auto print_size_vec = [&](const char* description, const std::vector& vec) { + ss << description << ":"; + for(auto i : vec) + ss << " " << i; + ss << separator; + }; + auto print_fields = [&](const char* description, const std::vector& fields) { + for(unsigned int fidx = 0; fidx < fields.size(); ++fidx) + { + const auto& f = fields[fidx]; + ss << description << " " << fidx << ":" << separator; + for(unsigned int bidx = 0; bidx < f.bricks.size(); ++bidx) + { + const auto& b = f.bricks[bidx]; + ss << " brick " << bidx << ":" << separator; + print_size_vec(" lower", b.lower); + print_size_vec(" upper", b.upper); + print_size_vec(" stride", b.stride); + ss << " device: " << b.device << separator; + } + } + }; + + print_size_vec("length", length); + if(have_ifields) + { + print_fields("ifield", ifields); + } + else + { + print_size_vec("istride", istride); + ss << "idist: " << idist << separator; + } + + if(have_ofields) + { + print_fields("ofield", ofields); + } + else + { + print_size_vec("ostride", ostride); + ss << "odist: " << odist << separator; + } + + ss << "batch: " << nbatch << separator; + print_size_vec("isize", isize); + print_size_vec("osize", osize); + + print_size_vec("ioffset", ioffset); + print_size_vec("ooffset", ooffset); + + if(placement == fft_placement_inplace) + ss << "in-place"; + else + ss << "out-of-place"; + ss << separator; + ss << "transform_type: " << transform_type_name() << separator; + ss << array_type_name(itype) << " -> " << array_type_name(otype) << separator; + switch(precision) + { + case fft_precision_half: + ss << "half-precision"; + break; + case fft_precision_single: + ss << "single-precision"; + break; + case fft_precision_double: + ss << "double-precision"; + break; + } + ss << separator; + + print_size_vec("ilength", ilength()); + print_size_vec("olength", olength()); + + print_size_vec("ibuffer_size", ibuffer_sizes()); + print_size_vec("obuffer_size", obuffer_sizes()); + + if(scale_factor != 1.0) + ss << "scale factor: " << scale_factor << separator; + + return ss.str(); + } + + // Produce a stringified token of the test fft params. 
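// For instance (illustrative values): a single-precision, out-of-place, contiguous
// 4x4 complex forward transform with batch 1 produces the token
//   complex_forward_len_4_4_single_op_batch_1_istride_4_1_CI_ostride_4_1_CI_idist_16_odist_16_ioffset_0_0_ooffset_0_0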
+ std::string token() const + { + std::string ret; + + switch(transform_type) + { + case fft_transform_type_complex_forward: + ret += "complex_forward_"; + break; + case fft_transform_type_complex_inverse: + ret += "complex_inverse_"; + break; + case fft_transform_type_real_forward: + ret += "real_forward_"; + break; + case fft_transform_type_real_inverse: + ret += "real_inverse_"; + break; + } + + auto append_size_vec = [&ret](const std::vector& vec) { + for(auto s : vec) + { + ret += "_"; + ret += std::to_string(s); + } + }; + + ret += "len"; + append_size_vec(length); + + switch(precision) + { + case fft_precision_half: + ret += "_half_"; + break; + case fft_precision_single: + ret += "_single_"; + break; + case fft_precision_double: + ret += "_double_"; + break; + } + + switch(placement) + { + case fft_placement_inplace: + ret += "ip_"; + break; + case fft_placement_notinplace: + ret += "op_"; + break; + } + + ret += "batch_"; + ret += std::to_string(nbatch); + + auto append_array_type = [&ret](fft_array_type type) { + switch(type) + { + case fft_array_type_complex_interleaved: + ret += "CI"; + break; + case fft_array_type_complex_planar: + ret += "CP"; + break; + case fft_array_type_real: + ret += "R"; + break; + case fft_array_type_hermitian_interleaved: + ret += "HI"; + break; + case fft_array_type_hermitian_planar: + ret += "HP"; + break; + default: + ret += "UN"; + break; + } + }; + + auto append_brick_info = [&ret, &append_size_vec](const fft_brick& b) { + ret += "_brick"; + + ret += "_lower"; + append_size_vec(b.lower); + ret += "_upper"; + append_size_vec(b.upper); + ret += "_stride"; + append_size_vec(b.stride); + ret += "_dev_"; + ret += std::to_string(b.device); + }; + + const bool have_ifields = !ifields.empty(); + const bool have_ofields = !ofields.empty(); + + if(have_ifields) + { + for(const auto& f : ifields) + { + ret += "_ifield"; + for(const auto& b : f.bricks) + append_brick_info(b); + } + } + else + { + ret += "_istride"; + append_size_vec(istride); + ret += "_"; + append_array_type(itype); + } + + if(have_ofields) + { + for(const auto& f : ofields) + { + ret += "_ofield"; + for(const auto& b : f.bricks) + append_brick_info(b); + } + } + else + { + ret += "_ostride"; + append_size_vec(ostride); + ret += "_"; + append_array_type(otype); + } + + if(!have_ifields) + { + ret += "_idist_"; + ret += std::to_string(idist); + } + if(!have_ofields) + { + ret += "_odist_"; + ret += std::to_string(odist); + } + + if(!have_ifields) + { + ret += "_ioffset"; + append_size_vec(ioffset); + } + + if(!have_ofields) + { + ret += "_ooffset"; + append_size_vec(ooffset); + } + + if(run_callbacks) + ret += "_CB"; + + if(scale_factor != 1.0) + ret += "_scale"; + + return ret; + } + + // Set all params from a stringified token. 
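// from_token() parses the fields written by token() above; e.g. (illustrative)
//   fft_params p;
//   p.from_token(q.token());
// recovers the transform type, lengths, precision, placement, batch, strides,
// distances, offsets, and any brick fields encoded by another fft_params object q.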
+ void from_token(std::string token) + { + std::vector vals; + + std::string delimiter = "_"; + { + size_t pos = 0; + while((pos = token.find(delimiter)) != std::string::npos) + { + auto val = token.substr(0, pos); + vals.push_back(val); + token.erase(0, pos + delimiter.length()); + } + vals.push_back(token); + } + + auto size_parser + = [](const std::vector& vals, const std::string token, size_t& pos) { + if(vals[pos++] != token) + throw std::runtime_error("Unable to parse token"); + return std::stoull(vals[pos++]); + }; + + auto vector_parser + = [](const std::vector& vals, const std::string token, size_t& pos) { + if(vals[pos++] != token) + throw std::runtime_error("Unable to parse token"); + std::vector vec; + + while(pos < vals.size()) + { + if(std::all_of(vals[pos].begin(), vals[pos].end(), ::isdigit)) + { + vec.push_back(std::stoull(vals[pos++])); + } + else + { + break; + } + } + return vec; + }; + + auto type_parser = [](const std::string& val) { + if(val == "CI") + return fft_array_type_complex_interleaved; + else if(val == "CP") + return fft_array_type_complex_planar; + else if(val == "R") + return fft_array_type_real; + else if(val == "HI") + return fft_array_type_hermitian_interleaved; + else if(val == "HP") + return fft_array_type_hermitian_planar; + return fft_array_type_unset; + }; + + auto field_parser = [&vector_parser, &size_parser](const std::vector& vals, + size_t& pos, + std::vector& output) { + // skip over ifield/ofield word + pos++; + fft_field& f = output.emplace_back(); + while(pos < vals.size() && vals[pos] == "brick") + { + fft_brick& b = f.bricks.emplace_back(); + pos++; + b.lower = vector_parser(vals, "lower", pos); + b.upper = vector_parser(vals, "upper", pos); + b.stride = vector_parser(vals, "stride", pos); + b.device = size_parser(vals, "dev", pos); + } + }; + + size_t pos = 0; + + bool complex = vals[pos++] == "complex"; + bool forward = vals[pos++] == "forward"; + + if(complex && forward) + transform_type = fft_transform_type_complex_forward; + if(complex && !forward) + transform_type = fft_transform_type_complex_inverse; + if(!complex && forward) + transform_type = fft_transform_type_real_forward; + if(!complex && !forward) + transform_type = fft_transform_type_real_inverse; + + length = vector_parser(vals, "len", pos); + + if(vals[pos] == "half") + precision = fft_precision_half; + else if(vals[pos] == "single") + precision = fft_precision_single; + else if(vals[pos] == "double") + precision = fft_precision_double; + pos++; + + placement = (vals[pos++] == "ip") ? 
fft_placement_inplace : fft_placement_notinplace; + + nbatch = size_parser(vals, "batch", pos); + + // strides, bricks etc are mixed in from here, so just keep + // looking at the next token to decide what to do + while(pos < vals.size()) + { + const auto& next_token = vals[pos]; + if(next_token == "istride") + { + istride = vector_parser(vals, "istride", pos); + itype = type_parser(vals[pos]); + pos++; + } + else if(next_token == "ostride") + { + ostride = vector_parser(vals, "ostride", pos); + otype = type_parser(vals[pos]); + pos++; + } + else if(next_token == "idist") + idist = size_parser(vals, "idist", pos); + else if(next_token == "odist") + odist = size_parser(vals, "odist", pos); + else if(next_token == "ioffset") + ioffset = vector_parser(vals, "ioffset", pos); + else if(next_token == "ooffset") + ooffset = vector_parser(vals, "ooffset", pos); + else if(next_token == "ifield") + field_parser(vals, pos, ifields); + else if(next_token == "ofield") + field_parser(vals, pos, ofields); + else + break; + } + + if(pos < vals.size() && vals[pos] == "CB") + { + run_callbacks = true; + ++pos; + } + + if(pos < vals.size() && vals[pos] == "scale") + { + // just pick some factor that's not zero or one + scale_factor = 0.1239; + ++pos; + } + } + + // Stream output operator (for gtest, etc). + friend std::ostream& operator<<(std::ostream& stream, const fft_params& params) + { + stream << params.str(); + return stream; + } + + // Dimension of the transform. + size_t dim() const + { + return length.size(); + } + + virtual std::vector ilength() const + { + auto ilength = length; + if(transform_type == fft_transform_type_real_inverse) + ilength[dim() - 1] = ilength[dim() - 1] / 2 + 1; + return ilength; + } + + virtual std::vector olength() const + { + auto olength = length; + if(transform_type == fft_transform_type_real_forward) + olength[dim() - 1] = olength[dim() - 1] / 2 + 1; + return olength; + } + + static size_t nbuffer(const fft_array_type type) + { + switch(type) + { + case fft_array_type_real: + case fft_array_type_complex_interleaved: + case fft_array_type_hermitian_interleaved: + return 1; + case fft_array_type_complex_planar: + case fft_array_type_hermitian_planar: + return 2; + case fft_array_type_unset: + return 0; + } + return 0; + } + + // Number of input buffers + size_t nibuffer() const + { + return nbuffer(itype); + } + + // Number of output buffers + size_t nobuffer() const + { + return nbuffer(otype); + } + + void set_iotypes() + { + if(itype == fft_array_type_unset) + { + switch(transform_type) + { + case fft_transform_type_complex_forward: + case fft_transform_type_complex_inverse: + itype = fft_array_type_complex_interleaved; + break; + case fft_transform_type_real_forward: + itype = fft_array_type_real; + break; + case fft_transform_type_real_inverse: + itype = fft_array_type_hermitian_interleaved; + break; + default: + throw std::runtime_error("Invalid transform type"); + } + } + if(otype == fft_array_type_unset) + { + switch(transform_type) + { + case fft_transform_type_complex_forward: + case fft_transform_type_complex_inverse: + otype = fft_array_type_complex_interleaved; + break; + case fft_transform_type_real_forward: + otype = fft_array_type_hermitian_interleaved; + break; + case fft_transform_type_real_inverse: + otype = fft_array_type_real; + break; + default: + throw std::runtime_error("Invalid transform type"); + } + } + } + + // Check that the input and output types are consistent. 
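// e.g. a real forward transform pairing fft_array_type_real input with
// fft_array_type_hermitian_interleaved (or _planar) output is accepted, while
// real input with fft_array_type_complex_interleaved output is rejected.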
+ bool check_iotypes() const + { + switch(itype) + { + case fft_array_type_complex_interleaved: + case fft_array_type_complex_planar: + case fft_array_type_hermitian_interleaved: + case fft_array_type_hermitian_planar: + case fft_array_type_real: + break; + default: + throw std::runtime_error("Invalid Input array type format"); + } + + switch(otype) + { + case fft_array_type_complex_interleaved: + case fft_array_type_complex_planar: + case fft_array_type_hermitian_interleaved: + case fft_array_type_hermitian_planar: + case fft_array_type_real: + break; + default: + throw std::runtime_error("Invalid Input array type format"); + } + + // Check that format choices are supported + if(transform_type != fft_transform_type_real_forward + && transform_type != fft_transform_type_real_inverse) + { + if(placement == fft_placement_inplace && itype != otype) + { + throw std::runtime_error( + "In-place transforms must have identical input and output types"); + } + } + + bool okformat = true; + switch(itype) + { + case fft_array_type_complex_interleaved: + case fft_array_type_complex_planar: + okformat = (otype == fft_array_type_complex_interleaved + || otype == fft_array_type_complex_planar); + break; + case fft_array_type_hermitian_interleaved: + case fft_array_type_hermitian_planar: + okformat = otype == fft_array_type_real; + break; + case fft_array_type_real: + okformat = (otype == fft_array_type_hermitian_interleaved + || otype == fft_array_type_hermitian_planar); + break; + default: + throw std::runtime_error("Invalid Input array type format"); + } + + return okformat; + } + + // Given a length vector, set the rest of the strides. + // The optional argument stride0 sets the stride for the contiguous dimension. + // The optional rcpadding argument sets the stride correctly for in-place + // multi-dimensional real/complex transforms. + // Format is row-major. + template + std::vector compute_stride(const std::vector& length, + const std::vector& stride0 = std::vector(), + const bool rcpadding = false) const + { + std::vector stride(dim()); + + size_t dimoffset = 0; + + if(stride0.size() == 0) + { + // Set the contiguous stride: + stride[dim() - 1] = 1; + dimoffset = 1; + } + else + { + // Copy the input values to the end of the stride array: + for(size_t i = 0; i < stride0.size(); ++i) + { + stride[dim() - stride0.size() + i] = stride0[i]; + } + } + + if(stride0.size() < dim()) + { + // Compute any remaining values via recursion. 
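+        // Illustrative example: for row-major lengths {8, 4, 2} with no
+        // user-supplied strides, the loop below produces stride = {8, 2, 1}:
+        // the fastest dimension gets stride 1 and each slower stride is the
+        // product of the faster lengths.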
+ for(size_t i = dim() - dimoffset - stride0.size(); i-- > 0;) + { + auto lengthip1 = length[i + 1]; + if(rcpadding && i == dim() - 2) + { + lengthip1 = 2 * (lengthip1 / 2 + 1); + } + stride[i] = stride[i + 1] * lengthip1; + } + } + + return stride; + } + + void compute_istride() + { + istride = compute_stride(ilength(), + istride, + placement == fft_placement_inplace + && transform_type == fft_transform_type_real_forward); + } + + void compute_ostride() + { + ostride = compute_stride(olength(), + ostride, + placement == fft_placement_inplace + && transform_type == fft_transform_type_real_inverse); + } + + virtual void compute_isize() + { + auto il = ilength(); + size_t val = compute_ptrdiff(il, istride, nbatch, idist); + isize.resize(nibuffer()); + for(unsigned int i = 0; i < isize.size(); ++i) + { + isize[i] = val + ioffset[i]; + } + } + + virtual void compute_osize() + { + auto ol = olength(); + size_t val = compute_ptrdiff(ol, ostride, nbatch, odist); + osize.resize(nobuffer()); + for(unsigned int i = 0; i < osize.size(); ++i) + { + osize[i] = val + ooffset[i]; + } + } + + std::vector ibuffer_sizes() const + { + std::vector ibuffer_sizes; + + // In-place real-to-complex transforms need to have enough space in the input buffer to + // accomadate the output, which is slightly larger. + if(placement == fft_placement_inplace && transform_type == fft_transform_type_real_forward) + { + return obuffer_sizes(); + } + + if(isize.empty()) + return ibuffer_sizes; + + switch(itype) + { + case fft_array_type_complex_planar: + case fft_array_type_hermitian_planar: + ibuffer_sizes.resize(2); + break; + default: + ibuffer_sizes.resize(1); + } + for(unsigned i = 0; i < ibuffer_sizes.size(); i++) + { + ibuffer_sizes[i] = isize[i] * var_size(precision, itype); + } + return ibuffer_sizes; + } + + virtual std::vector obuffer_sizes() const + { + std::vector obuffer_sizes; + + if(osize.empty()) + return obuffer_sizes; + + switch(otype) + { + case fft_array_type_complex_planar: + case fft_array_type_hermitian_planar: + obuffer_sizes.resize(2); + break; + default: + obuffer_sizes.resize(1); + } + for(unsigned i = 0; i < obuffer_sizes.size(); i++) + { + obuffer_sizes[i] = osize[i] * var_size(precision, otype); + } + return obuffer_sizes; + } + + // Compute the idist for a given transform based on the placeness, transform type, and data + // layout. + size_t compute_idist() const + { + size_t dist = 0; + // In-place 1D transforms need extra dist. + if(transform_type == fft_transform_type_real_forward && dim() == 1 + && placement == fft_placement_inplace) + { + dist = 2 * (length[0] / 2 + 1) * istride[0]; + return dist; + } + + if(transform_type == fft_transform_type_real_inverse && dim() == 1) + { + dist = (length[0] / 2 + 1) * istride[0]; + return dist; + } + + dist = (transform_type == fft_transform_type_real_inverse) + ? (length[dim() - 1] / 2 + 1) * istride[dim() - 1] + : length[dim() - 1] * istride[dim() - 1]; + for(unsigned int i = 0; i < dim() - 1; ++i) + { + dist = std::max(length[i] * istride[i], dist); + } + return dist; + } + void set_idist() + { + if(idist != 0) + return; + idist = compute_idist(); + } + + // Compute the odist for a given transform based on the placeness, transform type, and data + // layout. Row-major. + size_t compute_odist() const + { + size_t dist = 0; + // In-place 1D transforms need extra dist. 
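+        // Illustrative example: a length-8 in-place 1D real inverse transform
+        // with unit ostride gets odist = 2 * (8 / 2 + 1) = 10 real elements,
+        // enough to hold the (8 / 2 + 1) complex input values that share the
+        // same buffer.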
+ if(transform_type == fft_transform_type_real_inverse && dim() == 1 + && placement == fft_placement_inplace) + { + dist = 2 * (length[0] / 2 + 1) * ostride[0]; + return dist; + } + + if(transform_type == fft_transform_type_real_forward && dim() == 1) + { + dist = (length[0] / 2 + 1) * ostride[0]; + return dist; + } + + dist = (transform_type == fft_transform_type_real_forward) + ? (length[dim() - 1] / 2 + 1) * ostride[dim() - 1] + : length[dim() - 1] * ostride[dim() - 1]; + for(unsigned int i = 0; i < dim() - 1; ++i) + { + dist = std::max(length[i] * ostride[i], dist); + } + return dist; + } + void set_odist() + { + if(odist != 0) + return; + odist = compute_odist(); + } + + // Put the length, stride, batch, and dist into a single length/stride array and pass off to the + // validity checker. + bool valid_length_stride_batch_dist(const std::vector& l0, + const std::vector& s0, + const size_t n, + const size_t dist, + const int verbose = 0) const + { + if(l0.size() != s0.size()) + return false; + + // Length and stride vectors, including bathes: + std::vector l{}, s{}; + for(unsigned int i = 0; i < l0.size(); ++i) + { + if(l0[i] > 1) + { + if(s0[i] == 0) + return false; + l.push_back(l0[i]); + s.push_back(s0[i]); + } + } + if(n > 1) + { + if(dist == 0) + return false; + l.push_back(n); + s.push_back(dist); + } + + return array_valid(l, s, verbose); + } + + // Return true if the given GPU parameters would produce a valid transform. + bool valid(const int verbose) const + { + if(ioffset.size() < nibuffer() || ooffset.size() < nobuffer()) + return false; + + // Check that in-place transforms have the same input and output stride: + if(placement == fft_placement_inplace) + { + const auto stridesize = std::min(istride.size(), ostride.size()); + bool samestride = true; + for(unsigned int i = 0; i < stridesize; ++i) + { + if(istride[i] != ostride[i]) + samestride = false; + } + if((transform_type == fft_transform_type_complex_forward + || transform_type == fft_transform_type_complex_inverse) + && !samestride) + { + // In-place transforms require identical input and output strides. + if(verbose) + { + std::cout << "istride:"; + for(const auto& i : istride) + std::cout << " " << i; + std::cout << " ostride0:"; + for(const auto& i : ostride) + std::cout << " " << i; + std::cout << " differ; skipped for in-place transforms: skipping test" + << std::endl; + } + return false; + } + + if((transform_type == fft_transform_type_complex_forward + || transform_type == fft_transform_type_complex_inverse) + && (idist != odist) && nbatch > 1) + { + // In-place transforms require identical distance, if + // batch > 1. If batch is 1 then dist is ignored and + // the FFT should still work. + if(verbose) + { + std::cout << "idist:" << idist << " odist:" << odist + << " differ; skipped for in-place transforms: skipping test" + << std::endl; + } + return false; + } + + if((transform_type == fft_transform_type_real_forward + || transform_type == fft_transform_type_real_inverse) + && (istride.back() != 1 || ostride.back() != 1)) + { + // In-place real/complex transforms require unit strides. 
+ if(verbose) + { + std::cout + << "istride.back(): " << istride.back() + << " ostride.back(): " << ostride.back() + << " must be unitary for in-place real/complex transforms: skipping test" + << std::endl; + } + return false; + } + + if((itype == fft_array_type_complex_interleaved + && otype == fft_array_type_complex_planar) + || (itype == fft_array_type_complex_planar + && otype == fft_array_type_complex_interleaved)) + { + if(verbose) + { + std::cout << "In-place c2c transforms require identical io types; skipped.\n"; + } + return false; + } + + // Check offsets + switch(transform_type) + { + case fft_transform_type_complex_forward: + case fft_transform_type_complex_inverse: + for(unsigned int i = 0; i < nibuffer(); ++i) + { + if(ioffset[i] != ooffset[i]) + return false; + } + break; + case fft_transform_type_real_forward: + if(ioffset[0] != 2 * ooffset[0]) + return false; + break; + case fft_transform_type_real_inverse: + if(2 * ioffset[0] != ooffset[0]) + return false; + break; + } + } + + if(!check_iotypes()) + return false; + + // we can only check output strides on out-of-place + // transforms, since we need to initialize output to a known + // pattern + if(placement == fft_placement_inplace && check_output_strides) + return false; + + // Check input and output strides + if(valid_length_stride_batch_dist(ilength(), istride, nbatch, idist, verbose) != true) + { + if(verbose) + std::cout << "Invalid input data format.\n"; + return false; + } + if(!(ilength() == olength() && istride == ostride && idist == odist)) + { + // Only check if different + if(valid_length_stride_batch_dist(olength(), ostride, nbatch, odist, verbose) != true) + { + if(verbose) + std::cout << "Invalid output data format.\n"; + return false; + } + } + + // The parameters are valid. + return true; + } + + // Fill in any missing parameters. + void validate() + { + set_iotypes(); + compute_istride(); + compute_ostride(); + set_idist(); + set_odist(); + compute_isize(); + compute_osize(); + + validate_fields(); + } + + virtual void validate_fields() const + { + if(!ifields.empty() || !ofields.empty()) + throw std::runtime_error("input/output fields are unsupported"); + } + + // Column-major getters: + std::vector length_cm() const + { + auto length_cm = length; + std::reverse(std::begin(length_cm), std::end(length_cm)); + return length_cm; + } + std::vector ilength_cm() const + { + auto ilength_cm = ilength(); + std::reverse(std::begin(ilength_cm), std::end(ilength_cm)); + return ilength_cm; + } + std::vector olength_cm() const + { + auto olength_cm = olength(); + std::reverse(std::begin(olength_cm), std::end(olength_cm)); + return olength_cm; + } + std::vector istride_cm() const + { + auto istride_cm = istride; + std::reverse(std::begin(istride_cm), std::end(istride_cm)); + return istride_cm; + } + std::vector ostride_cm() const + { + auto ostride_cm = ostride; + std::reverse(std::begin(ostride_cm), std::end(ostride_cm)); + return ostride_cm; + } + bool is_planar() const + { + if(itype == fft_array_type_complex_planar || itype == fft_array_type_hermitian_planar) + return true; + if(otype == fft_array_type_complex_planar || otype == fft_array_type_hermitian_planar) + return true; + return false; + } + + // Given a data type and dimensions, fill the buffer, imposing Hermitian symmetry if necessary. 
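+    // (Hermitian symmetry is required when generating input for complex-to-real
+    // transforms, so that the generated spectrum corresponds to a purely real
+    // signal.)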
+ template + inline void compute_input(std::vector& input) + { + auto deviceProp = get_curr_device_prop(); + + switch(precision) + { + case fft_precision_half: + set_input( + input, igen, itype, length, ilength(), istride, idist, nbatch, deviceProp); + break; + case fft_precision_double: + set_input( + input, igen, itype, length, ilength(), istride, idist, nbatch, deviceProp); + break; + case fft_precision_single: + set_input( + input, igen, itype, length, ilength(), istride, idist, nbatch, deviceProp); + break; + } + } + + template + void print_ibuffer(const std::vector& buf, Tstream& stream = std::cout) const + { + switch(itype) + { + case fft_array_type_complex_interleaved: + case fft_array_type_hermitian_interleaved: + { + switch(precision) + { + case fft_precision_half: + { + buffer_printer> s; + s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset); + break; + } + case fft_precision_single: + { + buffer_printer> s; + s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset); + break; + } + case fft_precision_double: + { + buffer_printer> s; + s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset); + break; + } + } + break; + } + case fft_array_type_complex_planar: + case fft_array_type_hermitian_planar: + case fft_array_type_real: + { + switch(precision) + { + case fft_precision_half: + { + buffer_printer<_Float16> s; + s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset); + break; + } + case fft_precision_single: + { + buffer_printer s; + s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset); + break; + } + case fft_precision_double: + { + buffer_printer s; + s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset); + break; + } + } + break; + } + default: + throw std::runtime_error("Invalid itype in print_ibuffer"); + } + } + + template + void print_obuffer(const std::vector& buf, Tstream& stream = std::cout) const + { + switch(otype) + { + case fft_array_type_complex_interleaved: + case fft_array_type_hermitian_interleaved: + { + switch(precision) + { + case fft_precision_half: + { + buffer_printer> s; + s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset); + break; + } + case fft_precision_single: + { + buffer_printer> s; + s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset); + break; + } + case fft_precision_double: + buffer_printer> s; + s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset); + break; + } + break; + } + case fft_array_type_complex_planar: + case fft_array_type_hermitian_planar: + case fft_array_type_real: + { + switch(precision) + { + case fft_precision_half: + { + buffer_printer<_Float16> s; + s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset); + break; + } + case fft_precision_single: + { + buffer_printer s; + s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset); + break; + } + case fft_precision_double: + { + buffer_printer s; + s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset); + break; + } + } + break; + } + + default: + throw std::runtime_error("Invalid itype in print_obuffer"); + } + } + + void print_ibuffer_flat(const std::vector& buf) const + { + switch(itype) + { + case fft_array_type_complex_interleaved: + case fft_array_type_hermitian_interleaved: + { + switch(precision) + { + case fft_precision_half: + { + buffer_printer> s; + s.print_buffer_flat(buf, osize, ooffset); + break; + } + case fft_precision_single: + { + buffer_printer> s; + s.print_buffer_flat(buf, osize, ooffset); + break; + } + case fft_precision_double: + 
buffer_printer> s; + s.print_buffer_flat(buf, osize, ooffset); + break; + } + break; + } + case fft_array_type_complex_planar: + case fft_array_type_hermitian_planar: + case fft_array_type_real: + { + switch(precision) + { + case fft_precision_half: + { + buffer_printer<_Float16> s; + s.print_buffer_flat(buf, osize, ooffset); + break; + } + case fft_precision_single: + { + buffer_printer s; + s.print_buffer_flat(buf, osize, ooffset); + break; + } + case fft_precision_double: + { + buffer_printer s; + s.print_buffer_flat(buf, osize, ooffset); + break; + } + } + break; + default: + throw std::runtime_error("Invalid itype in print_ibuffer_flat"); + } + } + } + + void print_obuffer_flat(const std::vector& buf) const + { + switch(otype) + { + case fft_array_type_complex_interleaved: + case fft_array_type_hermitian_interleaved: + { + switch(precision) + { + case fft_precision_half: + { + buffer_printer> s; + s.print_buffer_flat(buf, osize, ooffset); + break; + } + case fft_precision_single: + { + buffer_printer> s; + s.print_buffer_flat(buf, osize, ooffset); + break; + } + case fft_precision_double: + buffer_printer> s; + s.print_buffer_flat(buf, osize, ooffset); + break; + } + break; + } + case fft_array_type_complex_planar: + case fft_array_type_hermitian_planar: + case fft_array_type_real: + { + switch(precision) + { + case fft_precision_half: + { + buffer_printer<_Float16> s; + s.print_buffer_flat(buf, osize, ooffset); + break; + } + case fft_precision_single: + { + buffer_printer s; + s.print_buffer_flat(buf, osize, ooffset); + break; + } + + case fft_precision_double: + { + buffer_printer s; + s.print_buffer_flat(buf, osize, ooffset); + break; + } + } + break; + default: + throw std::runtime_error("Invalid itype in print_ibuffer_flat"); + } + } + } + + virtual fft_status set_callbacks(void* load_cb_host, + void* load_cb_data, + void* store_cb_host, + void* store_cb_data) + { + return fft_status_success; + } + + virtual fft_status execute(void** in, void** out) + { + return fft_status_success; + }; + + size_t fft_params_vram_footprint() + { + return fft_params::vram_footprint(); + } + + virtual size_t vram_footprint() + { + const auto ibuf_size = ibuffer_sizes(); + size_t val = std::accumulate(ibuf_size.begin(), ibuf_size.end(), (size_t)1); + if(placement == fft_placement_notinplace) + { + const auto obuf_size = obuffer_sizes(); + val += std::accumulate(obuf_size.begin(), obuf_size.end(), (size_t)1); + } + return val; + } + + // Specific exception type for work buffer allocation failure. + // Tests that hit this can't fit on the GPU and should be skipped. 
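+    // Illustrative use (not part of this header): a GoogleTest-based harness
+    // might wrap plan creation as
+    //   try { params.create_plan(); }
+    //   catch(fft_params::work_buffer_alloc_failure&) { GTEST_SKIP(); }
+    // so that oversized cases are reported as skipped rather than failed.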
+ struct work_buffer_alloc_failure : public std::runtime_error + { + work_buffer_alloc_failure(const std::string& s) + : std::runtime_error(s) + { + } + }; + + virtual fft_status create_plan() + { + return fft_status_success; + } + + // Change a forward transform to it's inverse + void inverse_from_forward(fft_params& params_forward) + { + switch(params_forward.transform_type) + { + case fft_transform_type_complex_forward: + transform_type = fft_transform_type_complex_inverse; + break; + case fft_transform_type_real_forward: + transform_type = fft_transform_type_real_inverse; + break; + default: + throw std::runtime_error("Transform type not forward."); + } + + length = params_forward.length; + istride = params_forward.ostride; + ostride = params_forward.istride; + nbatch = params_forward.nbatch; + precision = params_forward.precision; + placement = params_forward.placement; + idist = params_forward.odist; + odist = params_forward.idist; + itype = params_forward.otype; + otype = params_forward.itype; + ioffset = params_forward.ooffset; + ooffset = params_forward.ioffset; + + run_callbacks = params_forward.run_callbacks; + + check_output_strides = params_forward.check_output_strides; + + scale_factor = 1 / params_forward.scale_factor; + } + + // prepare for multi-GPU transform. Generated input is in ibuffer. + // pibuffer, pobuffer are the pointers that will be passed to the + // FFT library's "execute" API. + virtual void multi_gpu_prepare(std::vector& ibuffer, + std::vector& pibuffer, + std::vector& pobuffer) + { + } + + // finalize multi-GPU transform. pobuffers are the pointers + // provided to the FFT library's "execute" API. obuffer is the + // buffer where transform output needs to go for validation + virtual void multi_gpu_finalize(std::vector& obuffer, std::vector& pobuffer) {} + + // create bricks in the specified field for the specified number + // of devices. The field is split along the highest FFT + // dimension, and the length only includes FFT lengths, not batch + // dimension. 
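+    // Illustrative example: with field_length = {8, 4} and deviceCount = 2, the
+    // slowest FFT dimension is split so device 0 owns rows [0, 4) and device 1
+    // owns rows [4, 8); each brick covers the full second dimension and the full
+    // batch, with contiguous row-major strides inside the brick.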
+ void distribute_field(int deviceCount, + std::vector& fields, + const std::vector& field_length) + { + size_t slowLen = field_length.front(); + if(slowLen < static_cast(deviceCount)) + throw std::runtime_error("too many devices to distribute length " + + std::to_string(slowLen)); + + auto& field = fields.emplace_back(); + + for(int i = 0; i < deviceCount; ++i) + { + // start at origin + std::vector field_lower(field_length.size()); + std::vector field_upper(field_length.size()); + + // note: slowest FFT dim is index 0 in these coordinates + field_lower[0] = slowLen / deviceCount * i; + + // last brick needs to include the whole slow len + if(i == deviceCount - 1) + { + field_upper[0] = slowLen; + } + else + { + field_upper[0] = std::min(slowLen, field_lower[0] + slowLen / deviceCount); + } + + for(unsigned int upperDim = 1; upperDim < field_length.size(); ++upperDim) + { + field_upper[upperDim] = field_length[upperDim]; + } + + // field coordinates also need to include batch + field_lower.insert(field_lower.begin(), 0); + field_upper.insert(field_upper.begin(), nbatch); + + // bricks have contiguous strides + size_t brick_dist = 1; + std::vector brick_stride(field_lower.size()); + for(size_t distIdx = 0; distIdx < field_lower.size(); ++distIdx) + { + // fill strides from fastest to slowest + *(brick_stride.rbegin() + distIdx) = brick_dist; + brick_dist *= *(field_upper.rbegin() + distIdx) - *(field_lower.rbegin() + distIdx); + } + field.bricks.push_back( + fft_params::fft_brick{field_lower, field_upper, brick_stride, i}); + } + } + + void distribute_input(int deviceCount) + { + distribute_field(deviceCount, ifields, length); + } + + void distribute_output(int deviceCount) + { + distribute_field(deviceCount, ofields, olength()); + } +}; + +// This is used with the program_options class so that the user can type an integer on the +// command line and we store into an enum varaible +template +std::basic_istream<_Elem, _Traits>& operator>>(std::basic_istream<_Elem, _Traits>& stream, + fft_array_type& atype) +{ + unsigned tmp; + stream >> tmp; + atype = fft_array_type(tmp); + return stream; +} + +// similarly for transform type +template +std::basic_istream<_Elem, _Traits>& operator>>(std::basic_istream<_Elem, _Traits>& stream, + fft_transform_type& ttype) +{ + unsigned tmp; + stream >> tmp; + ttype = fft_transform_type(tmp); + return stream; +} + +// Returns pairs of startindex, endindex, for 1D, 2D, 3D lengths +template +std::vector> partition_colmajor(const T1& length) +{ + return partition_base(length, compute_partition_count(length)); +} + +// Partition on the rightmost part of the tuple, for col-major indexing +template +std::vector, std::tuple>> + partition_colmajor(const std::tuple& length) +{ + auto partitions = partition_base(std::get<1>(length), compute_partition_count(length)); + std::vector, std::tuple>> ret(partitions.size()); + for(size_t i = 0; i < partitions.size(); ++i) + { + std::get<1>(ret[i].first) = partitions[i].first; + std::get<0>(ret[i].first) = 0; + std::get<1>(ret[i].second) = partitions[i].second; + std::get<0>(ret[i].second) = std::get<0>(length); + } + return ret; +} +template +std::vector, std::tuple>> + partition_colmajor(const std::tuple& length) +{ + auto partitions = partition_base(std::get<2>(length), compute_partition_count(length)); + std::vector, std::tuple>> ret(partitions.size()); + for(size_t i = 0; i < partitions.size(); ++i) + { + std::get<2>(ret[i].first) = partitions[i].first; + std::get<1>(ret[i].first) = 0; + std::get<0>(ret[i].first) = 0; + 
std::get<2>(ret[i].second) = partitions[i].second; + std::get<1>(ret[i].second) = std::get<1>(length); + std::get<0>(ret[i].second) = std::get<0>(length); + } + return ret; +} + +// Copy data of dimensions length with strides istride and length idist between batches to +// a buffer with strides ostride and length odist between batches. The input and output +// types are identical. +template +inline void copy_buffers_1to1(const Tval* input, + Tval* output, + const Tint1& whole_length, + const size_t nbatch, + const Tint2& istride, + const size_t idist, + const Tint3& ostride, + const size_t odist, + const std::vector& ioffset, + const std::vector& ooffset) +{ + const bool idx_equals_odx = istride == ostride && idist == odist; + size_t idx_base = 0; + size_t odx_base = 0; + auto partitions = partition_rowmajor(whole_length); + for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist) + { +#ifdef _OPENMP +#pragma omp parallel for num_threads(partitions.size()) +#endif + for(size_t part = 0; part < partitions.size(); ++part) + { + auto index = partitions[part].first; + const auto length = partitions[part].second; + do + { + const auto idx = compute_index(index, istride, idx_base); + const auto odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base); + output[odx + ooffset[0]] = input[idx + ioffset[0]]; + } while(increment_rowmajor(index, length)); + } + } +} + +// Copy data of dimensions length with strides istride and length idist between batches to +// a buffer with strides ostride and length odist between batches. The input type is +// planar and the output type is complex interleaved. +template +inline void copy_buffers_2to1(const Tval* input0, + const Tval* input1, + rocfft_complex* output, + const Tint1& whole_length, + const size_t nbatch, + const Tint2& istride, + const size_t idist, + const Tint3& ostride, + const size_t odist, + const std::vector& ioffset, + const std::vector& ooffset) +{ + const bool idx_equals_odx = istride == ostride && idist == odist; + size_t idx_base = 0; + size_t odx_base = 0; + auto partitions = partition_rowmajor(whole_length); + for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist) + { +#ifdef _OPENMP +#pragma omp parallel for num_threads(partitions.size()) +#endif + for(size_t part = 0; part < partitions.size(); ++part) + { + auto index = partitions[part].first; + const auto length = partitions[part].second; + do + { + const auto idx = compute_index(index, istride, idx_base); + const auto odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base); + output[odx + ooffset[0]] + = rocfft_complex(input0[idx + ioffset[0]], input1[idx + ioffset[1]]); + } while(increment_rowmajor(index, length)); + } + } +} + +// Copy data of dimensions length with strides istride and length idist between batches to +// a buffer with strides ostride and length odist between batches. The input type is +// complex interleaved and the output type is planar. 
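+// (Here "planar" means separate real and imaginary arrays: output0 receives the
+// real part and output1 the imaginary part of each interleaved input element.)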
+template +inline void copy_buffers_1to2(const rocfft_complex* input, + Tval* output0, + Tval* output1, + const Tint1& whole_length, + const size_t nbatch, + const Tint2& istride, + const size_t idist, + const Tint3& ostride, + const size_t odist, + const std::vector& ioffset, + const std::vector& ooffset) +{ + const bool idx_equals_odx = istride == ostride && idist == odist; + size_t idx_base = 0; + size_t odx_base = 0; + auto partitions = partition_rowmajor(whole_length); + for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist) + { +#ifdef _OPENMP +#pragma omp parallel for num_threads(partitions.size()) +#endif + for(size_t part = 0; part < partitions.size(); ++part) + { + auto index = partitions[part].first; + const auto length = partitions[part].second; + do + { + const auto idx = compute_index(index, istride, idx_base); + const auto odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base); + output0[odx + ooffset[0]] = input[idx + ioffset[0]].real(); + output1[odx + ooffset[1]] = input[idx + ioffset[0]].imag(); + } while(increment_rowmajor(index, length)); + } + } +} + +// Copy data of dimensions length with strides istride and length idist between batches to +// a buffer with strides ostride and length odist between batches. The input type given +// by itype, and the output type is given by otype. +template +inline void copy_buffers(const std::vector& input, + std::vector& output, + const Tint1& length, + const size_t nbatch, + const fft_precision precision, + const fft_array_type itype, + const Tint2& istride, + const size_t idist, + const fft_array_type otype, + const Tint3& ostride, + const size_t odist, + const std::vector& ioffset, + const std::vector& ooffset) +{ + if(itype == otype) + { + switch(itype) + { + case fft_array_type_complex_interleaved: + case fft_array_type_hermitian_interleaved: + switch(precision) + { + case fft_precision_half: + copy_buffers_1to1( + reinterpret_cast*>(input[0].data()), + reinterpret_cast*>(output[0].data()), + length, + nbatch, + istride, + idist, + ostride, + odist, + ioffset, + ooffset); + break; + case fft_precision_single: + copy_buffers_1to1(reinterpret_cast*>(input[0].data()), + reinterpret_cast*>(output[0].data()), + length, + nbatch, + istride, + idist, + ostride, + odist, + ioffset, + ooffset); + break; + case fft_precision_double: + copy_buffers_1to1(reinterpret_cast*>(input[0].data()), + reinterpret_cast*>(output[0].data()), + length, + nbatch, + istride, + idist, + ostride, + odist, + ioffset, + ooffset); + break; + } + break; + case fft_array_type_real: + case fft_array_type_complex_planar: + case fft_array_type_hermitian_planar: + for(unsigned int idx = 0; idx < input.size(); ++idx) + { + switch(precision) + { + case fft_precision_half: + copy_buffers_1to1(reinterpret_cast(input[idx].data()), + reinterpret_cast<_Float16*>(output[idx].data()), + length, + nbatch, + istride, + idist, + ostride, + odist, + ioffset, + ooffset); + break; + case fft_precision_single: + copy_buffers_1to1(reinterpret_cast(input[idx].data()), + reinterpret_cast(output[idx].data()), + length, + nbatch, + istride, + idist, + ostride, + odist, + ioffset, + ooffset); + break; + case fft_precision_double: + copy_buffers_1to1(reinterpret_cast(input[idx].data()), + reinterpret_cast(output[idx].data()), + length, + nbatch, + istride, + idist, + ostride, + odist, + ioffset, + ooffset); + break; + } + } + break; + default: + throw std::runtime_error("Invalid data type"); + } + } + else if((itype == fft_array_type_complex_interleaved && 
otype == fft_array_type_complex_planar) + || (itype == fft_array_type_hermitian_interleaved + && otype == fft_array_type_hermitian_planar)) + { + // copy 1to2 + switch(precision) + { + case fft_precision_half: + copy_buffers_1to2(reinterpret_cast*>(input[0].data()), + reinterpret_cast<_Float16*>(output[0].data()), + reinterpret_cast<_Float16*>(output[1].data()), + length, + nbatch, + istride, + idist, + ostride, + odist, + ioffset, + ooffset); + break; + case fft_precision_single: + copy_buffers_1to2(reinterpret_cast*>(input[0].data()), + reinterpret_cast(output[0].data()), + reinterpret_cast(output[1].data()), + length, + nbatch, + istride, + idist, + ostride, + odist, + ioffset, + ooffset); + break; + case fft_precision_double: + copy_buffers_1to2(reinterpret_cast*>(input[0].data()), + reinterpret_cast(output[0].data()), + reinterpret_cast(output[1].data()), + length, + nbatch, + istride, + idist, + ostride, + odist, + ioffset, + ooffset); + break; + } + } + else if((itype == fft_array_type_complex_planar && otype == fft_array_type_complex_interleaved) + || (itype == fft_array_type_hermitian_planar + && otype == fft_array_type_hermitian_interleaved)) + { + // copy 2 to 1 + switch(precision) + { + case fft_precision_half: + copy_buffers_2to1(reinterpret_cast(input[0].data()), + reinterpret_cast(input[1].data()), + reinterpret_cast*>(output[0].data()), + length, + nbatch, + istride, + idist, + ostride, + odist, + ioffset, + ooffset); + break; + case fft_precision_single: + copy_buffers_2to1(reinterpret_cast(input[0].data()), + reinterpret_cast(input[1].data()), + reinterpret_cast*>(output[0].data()), + length, + nbatch, + istride, + idist, + ostride, + odist, + ioffset, + ooffset); + break; + case fft_precision_double: + copy_buffers_2to1(reinterpret_cast(input[0].data()), + reinterpret_cast(input[1].data()), + reinterpret_cast*>(output[0].data()), + length, + nbatch, + istride, + idist, + ostride, + odist, + ioffset, + ooffset); + break; + } + } + else + { + throw std::runtime_error("Invalid input and output types."); + } +} + +// unroll arbitrary-dimension copy_buffers into specializations for 1-, 2-, 3-dimensions +template +inline void copy_buffers(const std::vector& input, + std::vector& output, + const std::vector& length, + const size_t nbatch, + const fft_precision precision, + const fft_array_type itype, + const std::vector& istride, + const size_t idist, + const fft_array_type otype, + const std::vector& ostride, + const size_t odist, + const std::vector& ioffset, + const std::vector& ooffset) +{ + switch(length.size()) + { + case 1: + return copy_buffers(input, + output, + length[0], + nbatch, + precision, + itype, + istride[0], + idist, + otype, + ostride[0], + odist, + ioffset, + ooffset); + case 2: + return copy_buffers(input, + output, + std::make_tuple(length[0], length[1]), + nbatch, + precision, + itype, + std::make_tuple(istride[0], istride[1]), + idist, + otype, + std::make_tuple(ostride[0], ostride[1]), + odist, + ioffset, + ooffset); + case 3: + return copy_buffers(input, + output, + std::make_tuple(length[0], length[1], length[2]), + nbatch, + precision, + itype, + std::make_tuple(istride[0], istride[1], istride[2]), + idist, + otype, + std::make_tuple(ostride[0], ostride[1], ostride[2]), + odist, + ioffset, + ooffset); + default: + abort(); + } +} + +// Compute the L-infinity and L-2 distance between two buffers with strides istride and +// length idist between batches to a buffer with strides ostride and length odist between +// batches. 
Both buffers are of complex type. + +struct VectorNorms +{ + double l_2 = 0.0, l_inf = 0.0; +}; + +template +inline VectorNorms distance_1to1_complex(const Tcomplex* input, + const Tcomplex* output, + const Tint1& whole_length, + const size_t nbatch, + const Tint2& istride, + const size_t idist, + const Tint3& ostride, + const size_t odist, + std::vector>* linf_failures, + const double linf_cutoff, + const std::vector& ioffset, + const std::vector& ooffset, + const double output_scalar = 1.0) +{ + double linf = 0.0; + double l2 = 0.0; + + std::mutex linf_failure_lock; + std::vector> linf_failures_private; + + const bool idx_equals_odx = istride == ostride && idist == odist; + size_t idx_base = 0; + size_t odx_base = 0; + auto partitions = partition_colmajor(whole_length); + for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist) + { +#ifdef _OPENMP +#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size()) private(linf_failures_private) +#endif + for(size_t part = 0; part < partitions.size(); ++part) + { + double cur_linf = 0.0; + double cur_l2 = 0.0; + auto index = partitions[part].first; + const auto length = partitions[part].second; + + do + { + const auto idx = compute_index(index, istride, idx_base); + const auto odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base); + const double rdiff + = std::abs(static_cast(output[odx + ooffset[0]].real()) * output_scalar + - static_cast(input[idx + ioffset[0]].real())); + cur_linf = std::max(rdiff, cur_linf); + if(cur_linf > linf_cutoff) + { + std::pair fval(b, idx); + if(linf_failures) + linf_failures_private.push_back(fval); + } + cur_l2 += rdiff * rdiff; + + const double idiff + = std::abs(static_cast(output[odx + ooffset[0]].imag()) * output_scalar + - static_cast(input[idx + ioffset[0]].imag())); + cur_linf = std::max(idiff, cur_linf); + if(cur_linf > linf_cutoff) + { + std::pair fval(b, idx); + if(linf_failures) + linf_failures_private.push_back(fval); + } + cur_l2 += idiff * idiff; + + } while(increment_rowmajor(index, length)); + linf = std::max(linf, cur_linf); + l2 += cur_l2; + + if(linf_failures) + { + linf_failure_lock.lock(); + std::copy(linf_failures_private.begin(), + linf_failures_private.end(), + std::back_inserter(*linf_failures)); + linf_failure_lock.unlock(); + } + } + } + return {.l_2 = sqrt(l2), .l_inf = linf}; +} + +// Compute the L-infinity and L-2 distance between two buffers with strides istride and +// length idist between batches to a buffer with strides ostride and length odist between +// batches. Both buffers are of real type. 
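+// (As in the complex case above: l_inf is the largest element-wise absolute
+// difference, l_2 is the square root of the sum of squared differences, and
+// (batch, index) pairs are appended to linf_failures, when provided, once the
+// running maximum exceeds linf_cutoff.)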
+template +inline VectorNorms distance_1to1_real(const Tfloat* input, + const Tfloat* output, + const Tint1& whole_length, + const size_t nbatch, + const Tint2& istride, + const size_t idist, + const Tint3& ostride, + const size_t odist, + std::vector>* linf_failures, + const double linf_cutoff, + const std::vector& ioffset, + const std::vector& ooffset, + const double output_scalar = 1.0) +{ + double linf = 0.0; + double l2 = 0.0; + + std::mutex linf_failure_lock; + std::vector> linf_failures_private; + + const bool idx_equals_odx = istride == ostride && idist == odist; + size_t idx_base = 0; + size_t odx_base = 0; + auto partitions = partition_rowmajor(whole_length); + for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist) + { +#ifdef _OPENMP +#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size()) private(linf_failures_private) +#endif + for(size_t part = 0; part < partitions.size(); ++part) + { + double cur_linf = 0.0; + double cur_l2 = 0.0; + auto index = partitions[part].first; + const auto length = partitions[part].second; + do + { + const auto idx = compute_index(index, istride, idx_base); + const auto odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base); + const double diff + = std::abs(static_cast(output[odx + ooffset[0]]) * output_scalar + - static_cast(input[idx + ioffset[0]])); + cur_linf = std::max(diff, cur_linf); + if(cur_linf > linf_cutoff) + { + std::pair fval(b, idx); + if(linf_failures) + linf_failures_private.push_back(fval); + } + cur_l2 += diff * diff; + + } while(increment_rowmajor(index, length)); + linf = std::max(linf, cur_linf); + l2 += cur_l2; + + if(linf_failures) + { + linf_failure_lock.lock(); + std::copy(linf_failures_private.begin(), + linf_failures_private.end(), + std::back_inserter(*linf_failures)); + linf_failure_lock.unlock(); + } + } + } + return {.l_2 = sqrt(l2), .l_inf = linf}; +} + +// Compute the L-infinity and L-2 distance between two buffers with strides istride and +// length idist between batches to a buffer with strides ostride and length odist between +// batches. input is complex-interleaved, output is complex-planar. +template +inline VectorNorms distance_1to2(const rocfft_complex* input, + const Tval* output0, + const Tval* output1, + const Tint1& whole_length, + const size_t nbatch, + const T2& istride, + const size_t idist, + const T3& ostride, + const size_t odist, + std::vector>* linf_failures, + const double linf_cutoff, + const std::vector& ioffset, + const std::vector& ooffset, + const double output_scalar = 1.0) +{ + double linf = 0.0; + double l2 = 0.0; + + std::mutex linf_failure_lock; + std::vector> linf_failures_private; + + const bool idx_equals_odx = istride == ostride && idist == odist; + size_t idx_base = 0; + size_t odx_base = 0; + auto partitions = partition_rowmajor(whole_length); + for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist) + { +#ifdef _OPENMP +#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size()) private(linf_failures_private) +#endif + for(size_t part = 0; part < partitions.size(); ++part) + { + double cur_linf = 0.0; + double cur_l2 = 0.0; + auto index = partitions[part].first; + const auto length = partitions[part].second; + do + { + const auto idx = compute_index(index, istride, idx_base); + const auto odx = idx_equals_odx ? 
idx : compute_index(index, ostride, odx_base); + const double rdiff + = std::abs(static_cast(output0[odx + ooffset[0]]) * output_scalar + - static_cast(input[idx + ioffset[0]].real())); + cur_linf = std::max(rdiff, cur_linf); + if(cur_linf > linf_cutoff) + { + std::pair fval(b, idx); + if(linf_failures) + linf_failures_private.push_back(fval); + } + cur_l2 += rdiff * rdiff; + + const double idiff + = std::abs(static_cast(output1[odx + ooffset[1]]) * output_scalar + - static_cast(input[idx + ioffset[0]].imag())); + cur_linf = std::max(idiff, cur_linf); + if(cur_linf > linf_cutoff) + { + std::pair fval(b, idx); + if(linf_failures) + linf_failures_private.push_back(fval); + } + cur_l2 += idiff * idiff; + + } while(increment_rowmajor(index, length)); + linf = std::max(linf, cur_linf); + l2 += cur_l2; + + if(linf_failures) + { + linf_failure_lock.lock(); + std::copy(linf_failures_private.begin(), + linf_failures_private.end(), + std::back_inserter(*linf_failures)); + linf_failure_lock.unlock(); + } + } + } + return {.l_2 = sqrt(l2), .l_inf = linf}; +} + +// Compute the L-inifnity and L-2 distance between two buffers of dimension length and +// with types given by itype, otype, and precision. +template +inline VectorNorms distance(const std::vector& input, + const std::vector& output, + const Tint1& length, + const size_t nbatch, + const fft_precision precision, + const fft_array_type itype, + const Tint2& istride, + const size_t idist, + const fft_array_type otype, + const Tint3& ostride, + const size_t odist, + std::vector>* linf_failures, + const double linf_cutoff, + const std::vector& ioffset, + const std::vector& ooffset, + const double output_scalar = 1.0) +{ + VectorNorms dist; + + if(itype == otype) + { + switch(itype) + { + case fft_array_type_complex_interleaved: + case fft_array_type_hermitian_interleaved: + switch(precision) + { + case fft_precision_half: + dist = distance_1to1_complex( + reinterpret_cast*>(input[0].data()), + reinterpret_cast*>(output[0].data()), + length, + nbatch, + istride, + idist, + ostride, + odist, + linf_failures, + linf_cutoff, + ioffset, + ooffset, + output_scalar); + break; + case fft_precision_single: + dist = distance_1to1_complex( + reinterpret_cast*>(input[0].data()), + reinterpret_cast*>(output[0].data()), + length, + nbatch, + istride, + idist, + ostride, + odist, + linf_failures, + linf_cutoff, + ioffset, + ooffset, + output_scalar); + break; + case fft_precision_double: + dist = distance_1to1_complex( + reinterpret_cast*>(input[0].data()), + reinterpret_cast*>(output[0].data()), + length, + nbatch, + istride, + idist, + ostride, + odist, + linf_failures, + linf_cutoff, + ioffset, + ooffset, + output_scalar); + break; + } + dist.l_2 *= dist.l_2; + break; + case fft_array_type_real: + case fft_array_type_complex_planar: + case fft_array_type_hermitian_planar: + for(unsigned int idx = 0; idx < input.size(); ++idx) + { + VectorNorms d; + switch(precision) + { + case fft_precision_half: + d = distance_1to1_real(reinterpret_cast(input[idx].data()), + reinterpret_cast(output[idx].data()), + length, + nbatch, + istride, + idist, + ostride, + odist, + linf_failures, + linf_cutoff, + ioffset, + ooffset, + output_scalar); + break; + case fft_precision_single: + d = distance_1to1_real(reinterpret_cast(input[idx].data()), + reinterpret_cast(output[idx].data()), + length, + nbatch, + istride, + idist, + ostride, + odist, + linf_failures, + linf_cutoff, + ioffset, + ooffset, + output_scalar); + break; + case fft_precision_double: + d = 
distance_1to1_real(reinterpret_cast(input[idx].data()), + reinterpret_cast(output[idx].data()), + length, + nbatch, + istride, + idist, + ostride, + odist, + linf_failures, + linf_cutoff, + ioffset, + ooffset, + output_scalar); + break; + } + dist.l_inf = std::max(d.l_inf, dist.l_inf); + dist.l_2 += d.l_2 * d.l_2; + } + break; + default: + throw std::runtime_error("Invalid input and output types."); + } + } + else if((itype == fft_array_type_complex_interleaved && otype == fft_array_type_complex_planar) + || (itype == fft_array_type_hermitian_interleaved + && otype == fft_array_type_hermitian_planar)) + { + switch(precision) + { + case fft_precision_half: + dist = distance_1to2(reinterpret_cast*>(input[0].data()), + reinterpret_cast(output[0].data()), + reinterpret_cast(output[1].data()), + length, + nbatch, + istride, + idist, + ostride, + odist, + linf_failures, + linf_cutoff, + ioffset, + ooffset, + output_scalar); + break; + case fft_precision_single: + dist = distance_1to2(reinterpret_cast*>(input[0].data()), + reinterpret_cast(output[0].data()), + reinterpret_cast(output[1].data()), + length, + nbatch, + istride, + idist, + ostride, + odist, + linf_failures, + linf_cutoff, + ioffset, + ooffset, + output_scalar); + break; + case fft_precision_double: + dist = distance_1to2(reinterpret_cast*>(input[0].data()), + reinterpret_cast(output[0].data()), + reinterpret_cast(output[1].data()), + length, + nbatch, + istride, + idist, + ostride, + odist, + linf_failures, + linf_cutoff, + ioffset, + ooffset, + output_scalar); + break; + } + dist.l_2 *= dist.l_2; + } + else if((itype == fft_array_type_complex_planar && otype == fft_array_type_complex_interleaved) + || (itype == fft_array_type_hermitian_planar + && otype == fft_array_type_hermitian_interleaved)) + { + switch(precision) + { + case fft_precision_half: + dist + = distance_1to2(reinterpret_cast*>(output[0].data()), + reinterpret_cast(input[0].data()), + reinterpret_cast(input[1].data()), + length, + nbatch, + ostride, + odist, + istride, + idist, + linf_failures, + linf_cutoff, + ioffset, + ooffset, + output_scalar); + break; + case fft_precision_single: + dist = distance_1to2(reinterpret_cast*>(output[0].data()), + reinterpret_cast(input[0].data()), + reinterpret_cast(input[1].data()), + length, + nbatch, + ostride, + odist, + istride, + idist, + linf_failures, + linf_cutoff, + ioffset, + ooffset, + output_scalar); + break; + case fft_precision_double: + dist = distance_1to2(reinterpret_cast*>(output[0].data()), + reinterpret_cast(input[0].data()), + reinterpret_cast(input[1].data()), + length, + nbatch, + ostride, + odist, + istride, + idist, + linf_failures, + linf_cutoff, + ioffset, + ooffset, + output_scalar); + break; + } + dist.l_2 *= dist.l_2; + } + else + { + throw std::runtime_error("Invalid input and output types."); + } + dist.l_2 = sqrt(dist.l_2); + return dist; +} + +// check if the specified length + stride/dist is contiguous +template +bool is_contiguous_rowmajor(const std::vector& length, + const std::vector& stride, + size_t dist) +{ + size_t expected_stride = 1; + auto stride_it = stride.rbegin(); + auto length_it = length.rbegin(); + for(; stride_it != stride.rend() && length_it != length.rend(); ++stride_it, ++length_it) + { + if(*stride_it != expected_stride) + return false; + expected_stride *= *length_it; + } + return expected_stride == dist; +} + +// Unroll arbitrary-dimension distance into specializations for 1-, 2-, 3-dimensions +template +inline VectorNorms distance(const std::vector& input, + const 
std::vector& output, + std::vector length, + size_t nbatch, + const fft_precision precision, + const fft_array_type itype, + std::vector istride, + const size_t idist, + const fft_array_type otype, + std::vector ostride, + const size_t odist, + std::vector>* linf_failures, + const double linf_cutoff, + const std::vector& ioffset, + const std::vector& ooffset, + const double output_scalar = 1.0) +{ + // If istride and ostride are both contiguous, collapse them down + // to one dimension. Index calculation is simpler (and faster) + // in the 1D case. + if(is_contiguous_rowmajor(length, istride, idist) + && is_contiguous_rowmajor(length, ostride, odist)) + { + length = {product(length.begin(), length.end()) * nbatch}; + istride = {static_cast(1)}; + ostride = {static_cast(1)}; + nbatch = 1; + } + + switch(length.size()) + { + case 1: + return distance(input, + output, + length[0], + nbatch, + precision, + itype, + istride[0], + idist, + otype, + ostride[0], + odist, + linf_failures, + linf_cutoff, + ioffset, + ooffset, + output_scalar); + case 2: + return distance(input, + output, + std::make_tuple(length[0], length[1]), + nbatch, + precision, + itype, + std::make_tuple(istride[0], istride[1]), + idist, + otype, + std::make_tuple(ostride[0], ostride[1]), + odist, + linf_failures, + linf_cutoff, + ioffset, + ooffset, + output_scalar); + case 3: + return distance(input, + output, + std::make_tuple(length[0], length[1], length[2]), + nbatch, + precision, + itype, + std::make_tuple(istride[0], istride[1], istride[2]), + idist, + otype, + std::make_tuple(ostride[0], ostride[1], ostride[2]), + odist, + linf_failures, + linf_cutoff, + ioffset, + ooffset, + output_scalar); + default: + abort(); + } +} + +// Compute the L-infinity and L-2 norm of a buffer with strides istride and +// length idist. Data is rocfft_complex. +template +inline VectorNorms norm_complex(const Tcomplex* input, + const T1& whole_length, + const size_t nbatch, + const T2& istride, + const size_t idist, + const std::vector& offset) +{ + double linf = 0.0; + double l2 = 0.0; + + size_t idx_base = 0; + auto partitions = partition_rowmajor(whole_length); + for(size_t b = 0; b < nbatch; b++, idx_base += idist) + { +#ifdef _OPENMP +#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size()) +#endif + for(size_t part = 0; part < partitions.size(); ++part) + { + double cur_linf = 0.0; + double cur_l2 = 0.0; + auto index = partitions[part].first; + const auto length = partitions[part].second; + do + { + const auto idx = compute_index(index, istride, idx_base); + + const double rval = std::abs(static_cast(input[idx + offset[0]].real())); + cur_linf = std::max(rval, cur_linf); + cur_l2 += rval * rval; + + const double ival = std::abs(static_cast(input[idx + offset[0]].imag())); + cur_linf = std::max(ival, cur_linf); + cur_l2 += ival * ival; + + } while(increment_rowmajor(index, length)); + linf = std::max(linf, cur_linf); + l2 += cur_l2; + } + } + return {.l_2 = sqrt(l2), .l_inf = linf}; +} + +// Compute the L-infinity and L-2 norm of abuffer with strides istride and +// length idist. Data is real-valued. 
+template +inline VectorNorms norm_real(const Tfloat* input, + const T1& whole_length, + const size_t nbatch, + const T2& istride, + const size_t idist, + const std::vector& offset) +{ + double linf = 0.0; + double l2 = 0.0; + + size_t idx_base = 0; + auto partitions = partition_rowmajor(whole_length); + for(size_t b = 0; b < nbatch; b++, idx_base += idist) + { +#ifdef _OPENMP +#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size()) +#endif + for(size_t part = 0; part < partitions.size(); ++part) + { + double cur_linf = 0.0; + double cur_l2 = 0.0; + auto index = partitions[part].first; + const auto length = partitions[part].second; + do + { + const auto idx = compute_index(index, istride, idx_base); + const double val = std::abs(static_cast(input[idx + offset[0]])); + cur_linf = std::max(val, cur_linf); + cur_l2 += val * val; + + } while(increment_rowmajor(index, length)); + linf = std::max(linf, cur_linf); + l2 += cur_l2; + } + } + return {.l_2 = sqrt(l2), .l_inf = linf}; +} + +// Compute the L-infinity and L-2 norm of abuffer with strides istride and +// length idist. Data format is given by precision and itype. +template +inline VectorNorms norm(const std::vector& input, + const T1& length, + const size_t nbatch, + const fft_precision precision, + const fft_array_type itype, + const T2& istride, + const size_t idist, + const std::vector& offset) +{ + VectorNorms norm; + + switch(itype) + { + case fft_array_type_complex_interleaved: + case fft_array_type_hermitian_interleaved: + switch(precision) + { + case fft_precision_half: + norm = norm_complex(reinterpret_cast*>(input[0].data()), + length, + nbatch, + istride, + idist, + offset); + break; + case fft_precision_single: + norm = norm_complex(reinterpret_cast*>(input[0].data()), + length, + nbatch, + istride, + idist, + offset); + break; + case fft_precision_double: + norm = norm_complex(reinterpret_cast*>(input[0].data()), + length, + nbatch, + istride, + idist, + offset); + break; + } + norm.l_2 *= norm.l_2; + break; + case fft_array_type_real: + case fft_array_type_complex_planar: + case fft_array_type_hermitian_planar: + for(unsigned int idx = 0; idx < input.size(); ++idx) + { + VectorNorms n; + switch(precision) + { + case fft_precision_half: + n = norm_real(reinterpret_cast(input[idx].data()), + length, + nbatch, + istride, + idist, + offset); + break; + case fft_precision_single: + n = norm_real(reinterpret_cast(input[idx].data()), + length, + nbatch, + istride, + idist, + offset); + break; + case fft_precision_double: + n = norm_real(reinterpret_cast(input[idx].data()), + length, + nbatch, + istride, + idist, + offset); + break; + } + norm.l_inf = std::max(n.l_inf, norm.l_inf); + norm.l_2 += n.l_2 * n.l_2; + } + break; + default: + throw std::runtime_error("Invalid data type"); + } + + norm.l_2 = sqrt(norm.l_2); + return norm; +} + +// Unroll arbitrary-dimension norm into specializations for 1-, 2-, 3-dimensions +template +inline VectorNorms norm(const std::vector& input, + std::vector length, + size_t nbatch, + const fft_precision precision, + const fft_array_type type, + std::vector stride, + const size_t dist, + const std::vector& offset) +{ + // If stride is contiguous, collapse it down to one dimension. + // Index calculation is simpler (and faster) in the 1D case. 
+ if(is_contiguous_rowmajor(length, stride, dist)) + { + length = {product(length.begin(), length.end()) * nbatch}; + stride = {static_cast(1)}; + nbatch = 1; + } + + switch(length.size()) + { + case 1: + return norm(input, length[0], nbatch, precision, type, stride[0], dist, offset); + case 2: + return norm(input, + std::make_tuple(length[0], length[1]), + nbatch, + precision, + type, + std::make_tuple(stride[0], stride[1]), + dist, + offset); + case 3: + return norm(input, + std::make_tuple(length[0], length[1], length[2]), + nbatch, + precision, + type, + std::make_tuple(stride[0], stride[1], stride[2]), + dist, + offset); + default: + abort(); + } +} + +// Given a data type and precision, the distance between batches, and +// the batch size, allocate the required host buffer(s). +static std::vector allocate_host_buffer(const fft_precision precision, + const fft_array_type type, + const std::vector& size) +{ + std::vector buffers(size.size()); + for(unsigned int i = 0; i < size.size(); ++i) + { + buffers[i].alloc(size[i] * var_size(precision, type)); + } + return buffers; +} + +// Check if the required buffers fit in the device vram. +inline bool vram_fits_problem(const size_t prob_size, const size_t vram_avail, int deviceId = 0) +{ + // We keep a small margin of error for fitting the problem into vram: + const size_t extra = 1 << 27; + + return vram_avail > prob_size + extra; +} + +// Computes the twiddle table VRAM footprint for r2c/c2r transforms. +// This function will return 0 for the other transform types, since +// the VRAM footprint in rocFFT is negligible for the other cases. +inline size_t twiddle_table_vram_footprint(const fft_params& params) +{ + size_t vram_footprint = 0; + + // Add vram footprint from real/complex even twiddle buffer size. + if(params.transform_type == fft_transform_type_real_forward + || params.transform_type == fft_transform_type_real_inverse) + { + const auto realdim = params.length.back(); + if(realdim % 2 == 0) + { + const auto complex_size = params.precision == fft_precision_single ? 8 : 16; + // even length twiddle size is 1/4 of the real size, but + // in complex elements + vram_footprint += realdim * complex_size / 4; + } + } + + return vram_footprint; +} + +#endif diff --git a/shared/fftw_transform.h b/shared/fftw_transform.h new file mode 100644 index 0000000..873a373 --- /dev/null +++ b/shared/fftw_transform.h @@ -0,0 +1,493 @@ +// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#pragma once +#ifndef FFTWTRANSFORM_H +#define FFTWTRANSFORM_H + +#include "hostbuf.h" +#include "rocfft_complex.h" +#include "test_params.h" +#include +#include + +// Function to return maximum error for float and double types. +// +// Following Schatzman (1996; Accuracy of the Discrete Fourier +// Transform and the Fast Fourier Transform), the shape of relative +// l_2 error vs length should look like +// +// epsilon * sqrt(log2(length)). +// +// The magic epsilon constants below were chosen so that we get a +// reasonable upper bound for (all of) our tests. +// +// For rocFFT, prime lengths result in the highest error. As such, +// the epsilons below are perhaps too loose for pow2 lengths; but they +// are appropriate for prime lengths. +template +inline double type_epsilon(); +template <> +inline double type_epsilon<_Float16>() +{ + return half_epsilon; +} +template <> +inline double type_epsilon() +{ + return single_epsilon; +} +template <> +inline double type_epsilon() +{ + return double_epsilon; +} + +// C++ traits to translate float->fftwf_complex and +// double->fftw_complex. +// The correct FFTW complex type can be accessed via, for example, +// using complex_t = typename fftw_complex_trait::complex_t; +template +struct fftw_trait; +template <> +struct fftw_trait<_Float16> +{ + // fftw does not support half precision, so use single precision and convert + using fftw_complex_type = fftwf_complex; + using fftw_plan_type = fftwf_plan; +}; +template <> +struct fftw_trait +{ + using fftw_complex_type = fftwf_complex; + using fftw_plan_type = fftwf_plan; +}; +template <> +struct fftw_trait +{ + using fftw_complex_type = fftw_complex; + using fftw_plan_type = fftw_plan; +}; + +// Copies the half-precision input buffer to a single-precision +// buffer. Note that the input buffer is already sized like it's a +// single-precision buffer (but only half of it is filled), because +// we allocate a single-precision buffer for FFTW to plan with. 
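+// For example (illustrative sizes): 8 half-precision values arrive in a
+// buffer of 8 * sizeof(float) = 32 bytes, of which only the first 16 bytes
+// contain _Float16 data, so the copy below converts
+// in.size() / sizeof(_Float16) / 2 = 8 elements.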
+static hostbuf half_to_single_copy(const hostbuf& in) +{ + auto out = in.copy(); + auto in_begin = reinterpret_cast(in.data()); + std::copy_n(in_begin, in.size() / sizeof(_Float16) / 2, reinterpret_cast(out.data())); + return out; +} + +// converts a wider precision buffer to a narrower precision, in-place +template +void narrow_precision_inplace(hostbuf& in) +{ + // ensure we're actually shrinking the data + static_assert(sizeof(TfloatIn) > sizeof(TfloatOut)); + + auto readPtr = reinterpret_cast(in.data()); + auto writePtr = reinterpret_cast(in.data()); + std::copy_n(readPtr, in.size() / sizeof(TfloatIn), writePtr); + in.shrink(in.size() / (sizeof(TfloatIn) / sizeof(TfloatOut))); +} + +static void single_to_half_inplace(hostbuf& in) +{ + narrow_precision_inplace(in); +} + +// Template wrappers for real-valued FFTW allocators: +template +inline Tfloat* fftw_alloc_real_type(size_t n); +template <> +inline float* fftw_alloc_real_type(size_t n) +{ + return fftwf_alloc_real(n); +} +template <> +inline double* fftw_alloc_real_type(size_t n) +{ + return fftw_alloc_real(n); +} + +// Template wrappers for complex-valued FFTW allocators: +template +inline typename fftw_trait::fftw_complex_type* fftw_alloc_complex_type(size_t n); +template <> +inline typename fftw_trait::fftw_complex_type* fftw_alloc_complex_type(size_t n) +{ + return fftwf_alloc_complex(n); +} +template <> +inline typename fftw_trait::fftw_complex_type* fftw_alloc_complex_type(size_t n) +{ + return fftw_alloc_complex(n); +} + +template +inline fftw_type* fftw_alloc_type(size_t n); +template <> +inline float* fftw_alloc_type(size_t n) +{ + return fftw_alloc_real_type(n); +} +template <> +inline double* fftw_alloc_type(size_t n) +{ + return fftw_alloc_real_type(n); +} +template <> +inline fftwf_complex* fftw_alloc_type(size_t n) +{ + return fftw_alloc_complex_type(n); +} +template <> +inline fftw_complex* fftw_alloc_type(size_t n) +{ + return fftw_alloc_complex_type(n); +} +template <> +inline rocfft_complex* fftw_alloc_type>(size_t n) +{ + return (rocfft_complex*)fftw_alloc_complex_type(n); +} +template <> +inline rocfft_complex* fftw_alloc_type>(size_t n) +{ + return (rocfft_complex*)fftw_alloc_complex_type(n); +} + +// Template wrappers for FFTW plan executors: +template +inline void fftw_execute_type(typename fftw_trait::fftw_plan_type plan); +template <> +inline void fftw_execute_type(typename fftw_trait::fftw_plan_type plan) +{ + return fftwf_execute(plan); +} +template <> +inline void fftw_execute_type(typename fftw_trait::fftw_plan_type plan) +{ + return fftw_execute(plan); +} + +// Template wrappers for FFTW plan destroyers: +template +inline void fftw_destroy_plan_type(Tfftw_plan plan); +template <> +inline void fftw_destroy_plan_type(fftwf_plan plan) +{ + return fftwf_destroy_plan(plan); +} +template <> +inline void fftw_destroy_plan_type(fftw_plan plan) +{ + return fftw_destroy_plan(plan); +} + +// Template wrappers for FFTW c2c planners: +template +inline typename fftw_trait::fftw_plan_type + fftw_plan_guru64_dft(int rank, + const fftw_iodim64* dims, + int howmany_rank, + const fftw_iodim64* howmany_dims, + typename fftw_trait::fftw_complex_type* in, + typename fftw_trait::fftw_complex_type* out, + int sign, + unsigned flags); + +template <> +inline typename fftw_trait<_Float16>::fftw_plan_type + fftw_plan_guru64_dft<_Float16>(int rank, + const fftw_iodim64* dims, + int howmany_rank, + const fftw_iodim64* howmany_dims, + typename fftw_trait<_Float16>::fftw_complex_type* in, + typename 
fftw_trait<_Float16>::fftw_complex_type* out, + int sign, + unsigned flags) +{ + return fftwf_plan_guru64_dft(rank, dims, howmany_rank, howmany_dims, in, out, sign, flags); +} + +template <> +inline typename fftw_trait::fftw_plan_type + fftw_plan_guru64_dft(int rank, + const fftw_iodim64* dims, + int howmany_rank, + const fftw_iodim64* howmany_dims, + typename fftw_trait::fftw_complex_type* in, + typename fftw_trait::fftw_complex_type* out, + int sign, + unsigned flags) +{ + return fftwf_plan_guru64_dft(rank, dims, howmany_rank, howmany_dims, in, out, sign, flags); +} + +template <> +inline typename fftw_trait::fftw_plan_type + fftw_plan_guru64_dft(int rank, + const fftw_iodim64* dims, + int howmany_rank, + const fftw_iodim64* howmany_dims, + typename fftw_trait::fftw_complex_type* in, + typename fftw_trait::fftw_complex_type* out, + int sign, + unsigned flags) +{ + return fftw_plan_guru64_dft(rank, dims, howmany_rank, howmany_dims, in, out, sign, flags); +} + +// Template wrappers for FFTW c2c executors: +template +inline void fftw_plan_execute_c2c(typename fftw_trait::fftw_plan_type plan, + std::vector& in, + std::vector& out); + +template <> +inline void fftw_plan_execute_c2c<_Float16>(typename fftw_trait<_Float16>::fftw_plan_type plan, + std::vector& in, + std::vector& out) +{ + // since FFTW does not natively support half precision, convert + // input to single, execute, then convert output back to half + auto in_single = half_to_single_copy(in.front()); + fftwf_execute_dft(plan, + reinterpret_cast(in_single.data()), + reinterpret_cast(out.front().data())); + single_to_half_inplace(out.front()); +} + +template <> +inline void fftw_plan_execute_c2c(typename fftw_trait::fftw_plan_type plan, + std::vector& in, + std::vector& out) +{ + fftwf_execute_dft(plan, + reinterpret_cast(in.front().data()), + reinterpret_cast(out.front().data())); +} + +template <> +inline void fftw_plan_execute_c2c(typename fftw_trait::fftw_plan_type plan, + std::vector& in, + std::vector& out) +{ + fftw_execute_dft(plan, + reinterpret_cast(in.front().data()), + reinterpret_cast(out.front().data())); +} + +// Template wrappers for FFTW r2c planners: +template +inline typename fftw_trait::fftw_plan_type + fftw_plan_guru64_r2c(int rank, + const fftw_iodim64* dims, + int howmany_rank, + const fftw_iodim64* howmany_dims, + Tfloat* in, + typename fftw_trait::fftw_complex_type* out, + unsigned flags); +template <> +inline typename fftw_trait<_Float16>::fftw_plan_type + fftw_plan_guru64_r2c<_Float16>(int rank, + const fftw_iodim64* dims, + int howmany_rank, + const fftw_iodim64* howmany_dims, + _Float16* in, + typename fftw_trait<_Float16>::fftw_complex_type* out, + unsigned flags) +{ + return fftwf_plan_guru64_dft_r2c( + rank, dims, howmany_rank, howmany_dims, reinterpret_cast(in), out, flags); +} +template <> +inline typename fftw_trait::fftw_plan_type + fftw_plan_guru64_r2c(int rank, + const fftw_iodim64* dims, + int howmany_rank, + const fftw_iodim64* howmany_dims, + float* in, + typename fftw_trait::fftw_complex_type* out, + unsigned flags) +{ + return fftwf_plan_guru64_dft_r2c(rank, dims, howmany_rank, howmany_dims, in, out, flags); +} +template <> +inline typename fftw_trait::fftw_plan_type + fftw_plan_guru64_r2c(int rank, + const fftw_iodim64* dims, + int howmany_rank, + const fftw_iodim64* howmany_dims, + double* in, + typename fftw_trait::fftw_complex_type* out, + unsigned flags) +{ + return fftw_plan_guru64_dft_r2c(rank, dims, howmany_rank, howmany_dims, in, out, flags); +} + +// Template wrappers for FFTW r2c 
executors: +template +inline void fftw_plan_execute_r2c(typename fftw_trait::fftw_plan_type plan, + std::vector& in, + std::vector& out); +template <> +inline void fftw_plan_execute_r2c<_Float16>(typename fftw_trait::fftw_plan_type plan, + std::vector& in, + std::vector& out) +{ + // since FFTW does not natively support half precision, convert + // input to single, execute, then convert output back to half + auto in_single = half_to_single_copy(in.front()); + fftwf_execute_dft_r2c(plan, + reinterpret_cast(in_single.data()), + reinterpret_cast(out.front().data())); + single_to_half_inplace(out.front()); +} +template <> +inline void fftw_plan_execute_r2c(typename fftw_trait::fftw_plan_type plan, + std::vector& in, + std::vector& out) +{ + fftwf_execute_dft_r2c(plan, + reinterpret_cast(in.front().data()), + reinterpret_cast(out.front().data())); +} +template <> +inline void fftw_plan_execute_r2c(typename fftw_trait::fftw_plan_type plan, + std::vector& in, + std::vector& out) +{ + fftw_execute_dft_r2c(plan, + reinterpret_cast(in.front().data()), + reinterpret_cast(out.front().data())); +} + +// Template wrappers for FFTW c2r planners: +template +inline typename fftw_trait::fftw_plan_type + fftw_plan_guru64_c2r(int rank, + const fftw_iodim64* dims, + int howmany_rank, + const fftw_iodim64* howmany_dims, + typename fftw_trait::fftw_complex_type* in, + Tfloat* out, + unsigned flags); +template <> +inline typename fftw_trait<_Float16>::fftw_plan_type + fftw_plan_guru64_c2r<_Float16>(int rank, + const fftw_iodim64* dims, + int howmany_rank, + const fftw_iodim64* howmany_dims, + typename fftw_trait<_Float16>::fftw_complex_type* in, + _Float16* out, + unsigned flags) +{ + return fftwf_plan_guru64_dft_c2r( + rank, dims, howmany_rank, howmany_dims, in, reinterpret_cast(out), flags); +} +template <> +inline typename fftw_trait::fftw_plan_type + fftw_plan_guru64_c2r(int rank, + const fftw_iodim64* dims, + int howmany_rank, + const fftw_iodim64* howmany_dims, + typename fftw_trait::fftw_complex_type* in, + float* out, + unsigned flags) +{ + return fftwf_plan_guru64_dft_c2r(rank, dims, howmany_rank, howmany_dims, in, out, flags); +} +template <> +inline typename fftw_trait::fftw_plan_type + fftw_plan_guru64_c2r(int rank, + const fftw_iodim64* dims, + int howmany_rank, + const fftw_iodim64* howmany_dims, + typename fftw_trait::fftw_complex_type* in, + double* out, + unsigned flags) +{ + return fftw_plan_guru64_dft_c2r(rank, dims, howmany_rank, howmany_dims, in, out, flags); +} + +// Template wrappers for FFTW c2r executors: +template +inline void fftw_plan_execute_c2r(typename fftw_trait::fftw_plan_type plan, + std::vector& in, + std::vector& out); +template <> +inline void fftw_plan_execute_c2r<_Float16>(typename fftw_trait::fftw_plan_type plan, + std::vector& in, + std::vector& out) +{ + // since FFTW does not natively support half precision, convert + // input to single, execute, then convert output back to half + auto in_single = half_to_single_copy(in.front()); + fftwf_execute_dft_c2r(plan, + reinterpret_cast(in_single.data()), + reinterpret_cast(out.front().data())); + single_to_half_inplace(out.front()); +} +template <> +inline void fftw_plan_execute_c2r(typename fftw_trait::fftw_plan_type plan, + std::vector& in, + std::vector& out) +{ + fftwf_execute_dft_c2r(plan, + reinterpret_cast(in.front().data()), + reinterpret_cast(out.front().data())); +} +template <> +inline void fftw_plan_execute_c2r(typename fftw_trait::fftw_plan_type plan, + std::vector& in, + std::vector& out) +{ + 
fftw_execute_dft_c2r(plan, + reinterpret_cast(in.front().data()), + reinterpret_cast(out.front().data())); +} + +#ifdef FFTW_HAVE_SPRINT_PLAN +// Template wrappers for FFTW print plan: +template +inline char* fftw_sprint_plan(const typename fftw_trait::fftw_plan_type plan); +template <> +inline char* fftw_sprint_plan<_Float16>(const typename fftw_trait<_Float16>::fftw_plan_type plan) +{ + return fftwf_sprint_plan(plan); +} +template <> +inline char* fftw_sprint_plan(const typename fftw_trait::fftw_plan_type plan) +{ + return fftwf_sprint_plan(plan); +} +template <> +inline char* fftw_sprint_plan(const typename fftw_trait::fftw_plan_type plan) +{ + return fftw_sprint_plan(plan); +} +#endif + +#endif diff --git a/shared/gpubuf.h b/shared/gpubuf.h new file mode 100644 index 0000000..993fa95 --- /dev/null +++ b/shared/gpubuf.h @@ -0,0 +1,134 @@ +// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#ifndef ROCFFT_GPUBUF_H +#define ROCFFT_GPUBUF_H + +#include "rocfft_hip.h" +#include + +// Simple RAII class for GPU buffers. T is the type of pointer that +// data() returns +template +class gpubuf_t +{ +public: + gpubuf_t() {} + // buffers are movable but not copyable + gpubuf_t(gpubuf_t&& other) + { + std::swap(buf, other.buf); + std::swap(bsize, other.bsize); + std::swap(device, other.device); + } + gpubuf_t& operator=(gpubuf_t&& other) + { + std::swap(buf, other.buf); + std::swap(bsize, other.bsize); + std::swap(device, other.device); + return *this; + } + gpubuf_t(const gpubuf_t&) = delete; + gpubuf_t& operator=(const gpubuf_t&) = delete; + + ~gpubuf_t() + { + free(); + } + + static bool use_alloc_managed() + { + return std::getenv("ROCFFT_MALLOC_MANAGED"); + } + + hipError_t alloc(const size_t size) + { + // remember the device that was current as of alloc, so we can + // free on the correct device + auto ret = hipGetDevice(&device); + if(ret != hipSuccess) + return ret; + + bsize = size; + static bool alloc_managed = use_alloc_managed(); + free(); + ret = alloc_managed ? 
hipMallocManaged(&buf, bsize) : hipMalloc(&buf, bsize); + if(ret != hipSuccess) + { + buf = nullptr; + bsize = 0; + } + return ret; + } + + size_t size() const + { + return bsize; + } + + void free() + { + if(buf != nullptr) + { + // free on the device we allocated on + rocfft_scoped_device dev(device); + (void)hipFree(buf); + buf = nullptr; + bsize = 0; + } + } + + // return a pointer to the allocated memory, offset by the + // specified number of bytes + T* data_offset(size_t offset_bytes = 0) const + { + void* ptr = static_cast(buf) + offset_bytes; + return static_cast(ptr); + } + + T* data() const + { + return static_cast(buf); + } + + // equality/bool tests + bool operator==(std::nullptr_t n) const + { + return buf == n; + } + bool operator!=(std::nullptr_t n) const + { + return buf != n; + } + operator bool() const + { + return buf; + } + +private: + // The GPU buffer + void* buf = nullptr; + size_t bsize = 0; + int device = 0; +}; + +// default gpubuf that gives out void* pointers +typedef gpubuf_t<> gpubuf; +#endif diff --git a/shared/hip_object_wrapper.h b/shared/hip_object_wrapper.h new file mode 100644 index 0000000..54083ab --- /dev/null +++ b/shared/hip_object_wrapper.h @@ -0,0 +1,86 @@ +/****************************************************************************** +* Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +* THE SOFTWARE. 
+*******************************************************************************/ + +#ifndef ROCFFT_HIP_OBJ_WRAPPER_H +#define ROCFFT_HIP_OBJ_WRAPPER_H + +#include "rocfft_hip.h" + +// RAII wrapper around HIP objects +template +struct hip_object_wrapper_t +{ + hip_object_wrapper_t() + : obj(nullptr) + { + } + + void alloc() + { + if(obj == nullptr && TCreate(&obj) != hipSuccess) + throw std::runtime_error("hip create failure"); + } + + void free() + { + if(obj) + { + (void)TDestroy(obj); + obj = nullptr; + } + } + + operator const T&() const + { + return obj; + } + operator T&() + { + return obj; + } + + operator bool() const + { + return obj != nullptr; + } + + ~hip_object_wrapper_t() + { + free(); + } + + hip_object_wrapper_t(const hip_object_wrapper_t&) = delete; + hip_object_wrapper_t& operator=(const hip_object_wrapper_t&) = delete; + hip_object_wrapper_t(hip_object_wrapper_t&& other) + : obj(other.obj) + { + other.obj = nullptr; + } + +private: + T obj; +}; + +typedef hip_object_wrapper_t hipStream_wrapper_t; +typedef hip_object_wrapper_t hipEvent_wrapper_t; + +#endif // ROCFFT_HIP_OBJ_WRAPPER_H diff --git a/shared/hostbuf.h b/shared/hostbuf.h new file mode 100644 index 0000000..0a96c7d --- /dev/null +++ b/shared/hostbuf.h @@ -0,0 +1,158 @@ +// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#ifndef ROCFFT_HOSTBUF_H +#define ROCFFT_HOSTBUF_H + +#include "arithmetic.h" +#include +#include + +#ifndef WIN32 +#include +#include +#endif + +// Simple RAII class for host buffers. T is the type of pointer that +// data() returns +template +class hostbuf_t +{ +public: + hostbuf_t() {} + // buffers are movable but not copyable + hostbuf_t(hostbuf_t&& other) + { + std::swap(buf, other.buf); + std::swap(bsize, other.bsize); + } + hostbuf_t& operator=(hostbuf_t&& other) + { + std::swap(buf, other.buf); + std::swap(bsize, other.bsize); + return *this; + } + hostbuf_t(const hostbuf_t&) = delete; + hostbuf_t& operator=(const hostbuf_t&) = delete; + + ~hostbuf_t() + { + free(); + } + + void alloc(size_t size) + { + bsize = size; + free(); + + // we're aligning to multiples of 64 bytes, so round the + // allocation size up to the nearest 64 to keep ASAN happy + if(size % 64) + { + size += 64 - size % 64; + } + + // FFTW requires aligned allocations to use faster SIMD instructions. + // If enabling hugepages, align to 2 MiB. 
Otherwise, aligning to + // 64 bytes is enough for AVX instructions up to AVX512. +#ifdef WIN32 + buf = _aligned_malloc(size, 64); +#else + // On Linux, ask for hugepages to reduce TLB pressure and + // improve performance. Allocations need to be aligned to + // the hugepage size, and rounded up to the next whole + // hugepage. + static const size_t TWO_MiB = 2 * 1024 * 1024; + if(size >= TWO_MiB) + { + size_t rounded_size = DivRoundingUp(size, TWO_MiB) * TWO_MiB; + buf = aligned_alloc(TWO_MiB, rounded_size); + madvise(buf, rounded_size, MADV_HUGEPAGE); + } + else + buf = aligned_alloc(64, size); +#endif + } + + size_t size() const + { + return bsize; + } + + void free() + { + if(buf != nullptr) + { +#ifdef WIN32 + _aligned_free(buf); +#else + std::free(buf); +#endif + buf = nullptr; + bsize = 0; + } + } + + T* data() const + { + return static_cast(buf); + } + + // Copy method + hostbuf_t copy() const + { + hostbuf_t copy; + copy.alloc(bsize); + memcpy(copy.buf, buf, bsize); + return copy; + } + + // shrink the buffer to fit the new size + void shrink(size_t new_size) + { + if(new_size > bsize) + throw std::runtime_error("can't shrink hostbuf to larger size"); + // just pretend the buffer is now that size + bsize = new_size; + } + + // equality/bool tests + bool operator==(std::nullptr_t n) const + { + return buf == n; + } + bool operator!=(std::nullptr_t n) const + { + return buf != n; + } + operator bool() const + { + return buf; + } + +private: + // The host buffer + void* buf = nullptr; + size_t bsize = 0; +}; + +// default hostbuf that gives out void* pointers +typedef hostbuf_t<> hostbuf; +#endif diff --git a/shared/increment.h b/shared/increment.h new file mode 100644 index 0000000..90bba1d --- /dev/null +++ b/shared/increment.h @@ -0,0 +1,100 @@ +// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#ifndef ROCFFT_INCREMENT_H +#define ROCFFT_INCREMENT_H + +#include +#include +#include + +// Helper functions to iterate over a buffer in row-major order. +// Indexes may be given as either a tuple or vector of sizes. They +// return true if the index was successfully incremented to move to +// the next element in the buffer. 
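+//
+// For example (illustrative): starting from index {0, 0} with
+// length = {2, 3}, successive calls to increment_rowmajor visit
+// {0,1}, {0,2}, {1,0}, {1,1}, {1,2} and return true, then return false
+// once the index wraps back around to {0, 0}.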
+ +template +static bool increment_base(T1& index, const T2& length) +{ + static_assert(std::is_integral::value, "Integral required."); + static_assert(std::is_integral::value, "Integral required."); + if(index < length - 1) + { + ++index; + return true; + } + index = 0; + return false; +} + +// Increment the index (row-major) for looping over 1, 2, and 3 dimensions length. +template +static bool increment_rowmajor(T1& index, const T2& length) +{ + static_assert(std::is_integral::value, "Integral required."); + static_assert(std::is_integral::value, "Integral required."); + return increment_base(index, length); +} + +template +static bool increment_rowmajor(std::tuple& index, const std::tuple& length) +{ + if(increment_base(std::get<1>(index), std::get<1>(length))) + // we incremented ok, nothing further to do + return true; + // otherwise, we rolled over + return increment_base(std::get<0>(index), std::get<0>(length)); +} + +template +static bool increment_rowmajor(std::tuple& index, const std::tuple& length) +{ + if(increment_base(std::get<2>(index), std::get<2>(length))) + // we incremented ok, nothing further to do + return true; + if(increment_base(std::get<1>(index), std::get<1>(length))) + // we incremented ok, nothing further to do + return true; + // otherwise, we rolled over + return increment_base(std::get<0>(index), std::get<0>(length)); +} + +// Increment row-major index over arbitrary dimension length +template +bool increment_rowmajor(std::vector& index, const std::vector& length) +{ + for(int idim = length.size(); idim-- > 0;) + { + if(index[idim] < length[idim]) + { + if((++index[idim]) == length[idim]) + { + index[idim] = 0; + continue; + } + // we know we were able to increment something and didn't hit the end + return true; + } + } + // End the loop when we get back to the start: + return !std::all_of(index.begin(), index.end(), [](int i) { return i == 0; }); +} + +#endif diff --git a/shared/precision_type.h b/shared/precision_type.h new file mode 100644 index 0000000..526fc9a --- /dev/null +++ b/shared/precision_type.h @@ -0,0 +1,70 @@ +// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +#ifndef ROCFFT_PRECISION_TYPE_H +#define ROCFFT_PRECISION_TYPE_H + +#include "array_predicate.h" +#include "rocfft/rocfft.h" + +static size_t real_type_size(rocfft_precision precision) +{ + switch(precision) + { + case rocfft_precision_half: + return 2; + case rocfft_precision_single: + return 4; + case rocfft_precision_double: + return 8; + } +} + +static size_t complex_type_size(rocfft_precision precision) +{ + return real_type_size(precision) * 2; +} + +static const char* precision_name(rocfft_precision precision) +{ + switch(precision) + { + case rocfft_precision_half: + return "half"; + case rocfft_precision_single: + return "single"; + case rocfft_precision_double: + return "double"; + } +} + +static size_t element_size(rocfft_precision precision, rocfft_array_type array_type) +{ + return array_type_is_complex(array_type) ? complex_type_size(precision) + : real_type_size(precision); +} + +// offset a pointer by a number of elements, given the elements' +// precision and type (complex or not) +static void* ptr_offset(void* p, size_t elems, rocfft_precision precision, rocfft_array_type type) +{ + return static_cast(p) + elems * element_size(precision, type); +} +#endif diff --git a/shared/printbuffer.h b/shared/printbuffer.h new file mode 100644 index 0000000..5ae0b64 --- /dev/null +++ b/shared/printbuffer.h @@ -0,0 +1,108 @@ +// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#ifndef PRINTBUFFER_H +#define PRINTBUFFER_H + +#include "hostbuf.h" +#include "increment.h" +#include +#include + +// Output a formatted general-dimensional array with given length and stride in batches +// separated by dist. 
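+//
+// A minimal usage sketch (illustrative; `data` is assumed to point at a
+// single batch of 6 floats laid out as a row-major 2x3 array):
+//
+//   std::vector<size_t> length{2, 3}, stride{3, 1};
+//   printbuffer(data, length, stride, size_t(1), size_t(6), size_t(0), std::cout);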
+template +inline void printbuffer(const Toutput* output, + const std::vector& length, + const std::vector& stride, + const Tsize nbatch, + const Tsize dist, + const size_t offset, + Tstream& stream) +{ + auto i_base = 0; + for(unsigned int b = 0; b < nbatch; b++, i_base += dist) + { + std::vector index(length.size()); + std::fill(index.begin(), index.end(), 0); + do + { + const int i + = std::inner_product(index.begin(), index.end(), stride.begin(), i_base + offset); + stream << output[i] << " "; + for(int li = index.size(); li-- > 0;) + { + if(index[li] == (length[li] - 1)) + { + stream << "\n"; + } + else + { + break; + } + } + } while(increment_rowmajor(index, length)); + stream << std::endl; + } +} + +template +class buffer_printer +{ + // The scalar versions might be part of a planar format. +public: + template + static void print_buffer(const std::vector& buf, + const std::vector& length, + const std::vector& stride, + const Tsize nbatch, + const Tsize dist, + const std::vector& offset, + Tstream& stream = std::cout) + { + for(const auto& vec : buf) + { + printbuffer(reinterpret_cast(vec.data()), + length, + stride, + nbatch, + dist, + offset[0], + stream); + } + }; + template + static void print_buffer_flat(const std::vector& buf, + const std::vector& size, + const std::vector& offset, + Tstream& stream = std::cout) + { + for(const auto& vec : buf) + { + auto data = reinterpret_cast(vec.data()); + stream << "idx " << 0; + for(size_t i = 0; i < size[0]; ++i) + stream << " " << data[i]; + stream << std::endl; + } + }; +}; + +#endif diff --git a/shared/ptrdiff.h b/shared/ptrdiff.h new file mode 100644 index 0000000..3bd15de --- /dev/null +++ b/shared/ptrdiff.h @@ -0,0 +1,40 @@ +// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#pragma once + +// Compute the farthest point from the original pointer. +static size_t compute_ptrdiff(const std::vector& length, + const std::vector& stride, + const size_t nbatch, + const size_t dist) +{ + size_t val = 0; + if(!length.empty()) + { + val = 1; + for(unsigned int i = 0; i < length.size(); ++i) + { + val += (length[i] - 1) * stride[i]; + } + val += (nbatch - 1) * dist; + } + return val; +} diff --git a/shared/rocfft_accuracy_test.h b/shared/rocfft_accuracy_test.h new file mode 100644 index 0000000..4ce3059 --- /dev/null +++ b/shared/rocfft_accuracy_test.h @@ -0,0 +1,29 @@ +// Copyright (C) 2022 - 2022 Advanced Micro Devices, Inc. All rights reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#ifndef ROCFFT_ACCURACY_TEST +#define ROCFFT_ACCURACY_TEST + +#include "accuracy_test.h" +#include "rocfft_params.h" + +void fft_vs_reference(rocfft_params& params, bool round_trip = false); + +#endif diff --git a/shared/rocfft_against_fftw.h b/shared/rocfft_against_fftw.h new file mode 100644 index 0000000..d03754c --- /dev/null +++ b/shared/rocfft_against_fftw.h @@ -0,0 +1,231 @@ +// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#pragma once +#ifndef ROCFFT_AGAINST_FFTW +#define ROCFFT_AGAINST_FFTW + +#include +#include +#include +#include + +#include "fftw_transform.h" + +// Return the precision enum for rocFFT based upon the type. +template +inline fft_precision precision_selector(); +template <> +inline fft_precision precision_selector() +{ + return fft_precision_single; +} +template <> +inline fft_precision precision_selector() +{ + return fft_precision_double; +} + +extern bool use_fftw_wisdom; + +// construct and return an FFTW plan with the specified type, +// precision, and dimensions. cpu_out is required if we're using +// wisdom, which runs actual FFTs to work out the best plan. 
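+//
+// Together with fftw_plan_via_rocfft and fftw_run below, the reference path
+// looks roughly like this (sketch only; `params` is assumed to be a populated
+// fft_params and `input`/`output` host buffers allocated for it):
+//
+//   auto cpu_plan = fftw_plan_via_rocfft<float>(params.length,
+//                                               params.istride,
+//                                               params.ostride,
+//                                               params.nbatch,
+//                                               params.idist,
+//                                               params.odist,
+//                                               params.transform_type,
+//                                               input,
+//                                               output);
+//   fftw_run<float>(params.transform_type, cpu_plan, input, output);
+//   fftw_destroy_plan_type(cpu_plan);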
+template +static typename fftw_trait::fftw_plan_type + fftw_plan_with_precision(const std::vector& dims, + const std::vector& howmany_dims, + const fft_transform_type transformType, + const size_t isize, + void* cpu_in, + void* cpu_out) +{ + using fftw_complex_type = typename fftw_trait::fftw_complex_type; + + // NB: Using FFTW_MEASURE implies that the input buffer's data + // may be destroyed during plan creation. But if we're wanting + // to run FFTW in the first place, we must have just created an + // uninitialized input buffer anyway. + + switch(transformType) + { + case fft_transform_type_complex_forward: + return fftw_plan_guru64_dft(dims.size(), + dims.data(), + howmany_dims.size(), + howmany_dims.data(), + reinterpret_cast(cpu_in), + reinterpret_cast(cpu_out), + -1, + use_fftw_wisdom ? FFTW_MEASURE : FFTW_ESTIMATE); + case fft_transform_type_complex_inverse: + return fftw_plan_guru64_dft(dims.size(), + dims.data(), + howmany_dims.size(), + howmany_dims.data(), + reinterpret_cast(cpu_in), + reinterpret_cast(cpu_out), + 1, + use_fftw_wisdom ? FFTW_MEASURE : FFTW_ESTIMATE); + case fft_transform_type_real_forward: + return fftw_plan_guru64_r2c(dims.size(), + dims.data(), + howmany_dims.size(), + howmany_dims.data(), + reinterpret_cast(cpu_in), + reinterpret_cast(cpu_out), + use_fftw_wisdom ? FFTW_MEASURE : FFTW_ESTIMATE); + case fft_transform_type_real_inverse: + return fftw_plan_guru64_c2r(dims.size(), + dims.data(), + howmany_dims.size(), + howmany_dims.data(), + reinterpret_cast(cpu_in), + reinterpret_cast(cpu_out), + use_fftw_wisdom ? FFTW_MEASURE : FFTW_ESTIMATE); + default: + throw std::runtime_error("Invalid transform type"); + } +} + +// construct an FFTW plan, given rocFFT parameters. output is +// required if planning with wisdom. +template +static typename fftw_trait::fftw_plan_type + fftw_plan_via_rocfft(const std::vector& length, + const std::vector& istride, + const std::vector& ostride, + const size_t nbatch, + const size_t idist, + const size_t odist, + const fft_transform_type transformType, + std::vector& input, + std::vector& output) +{ + // Dimension configuration: + std::vector dims(length.size()); + for(unsigned int idx = 0; idx < length.size(); ++idx) + { + dims[idx].n = length[idx]; + dims[idx].is = istride[idx]; + dims[idx].os = ostride[idx]; + } + + // Batch configuration: + std::vector howmany_dims(1); + howmany_dims[0].n = nbatch; + howmany_dims[0].is = idist; + howmany_dims[0].os = odist; + + return fftw_plan_with_precision(dims, + howmany_dims, + transformType, + idist * nbatch, + input.front().data(), + output.empty() ? nullptr : output.front().data()); +} + +template +void fftw_run(fft_transform_type transformType, + typename fftw_trait::fftw_plan_type cpu_plan, + std::vector& cpu_in, + std::vector& cpu_out) +{ + switch(transformType) + { + case fft_transform_type_complex_forward: + { + fftw_plan_execute_c2c(cpu_plan, cpu_in, cpu_out); + break; + } + case fft_transform_type_complex_inverse: + { + fftw_plan_execute_c2c(cpu_plan, cpu_in, cpu_out); + break; + } + case fft_transform_type_real_forward: + { + fftw_plan_execute_r2c(cpu_plan, cpu_in, cpu_out); + break; + } + case fft_transform_type_real_inverse: + { + fftw_plan_execute_c2r(cpu_plan, cpu_in, cpu_out); + break; + } + } +} + +// Given a transform type, return the contiguous input type. 
+inline fft_array_type contiguous_itype(const fft_transform_type transformType) +{ + switch(transformType) + { + case fft_transform_type_complex_forward: + case fft_transform_type_complex_inverse: + return fft_array_type_complex_interleaved; + case fft_transform_type_real_forward: + return fft_array_type_real; + case fft_transform_type_real_inverse: + return fft_array_type_hermitian_interleaved; + default: + throw std::runtime_error("Invalid transform type"); + } + return fft_array_type_complex_interleaved; +} + +// Given a transform type, return the contiguous output type. +inline fft_array_type contiguous_otype(const fft_transform_type transformType) +{ + switch(transformType) + { + case fft_transform_type_complex_forward: + case fft_transform_type_complex_inverse: + return fft_array_type_complex_interleaved; + case fft_transform_type_real_forward: + return fft_array_type_hermitian_interleaved; + case fft_transform_type_real_inverse: + return fft_array_type_real; + default: + throw std::runtime_error("Invalid transform type"); + } + return fft_array_type_complex_interleaved; +} + +// Given a precision, return the acceptable tolerance. +inline double type_epsilon(const fft_precision precision) +{ + switch(precision) + { + case fft_precision_half: + return type_epsilon<_Float16>(); + break; + case fft_precision_single: + return type_epsilon(); + break; + case fft_precision_double: + return type_epsilon(); + break; + default: + throw std::runtime_error("Invalid precision"); + } +} + +#endif diff --git a/shared/rocfft_complex.h b/shared/rocfft_complex.h new file mode 100644 index 0000000..efa0290 --- /dev/null +++ b/shared/rocfft_complex.h @@ -0,0 +1,346 @@ +// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#ifndef ROCFFT_COMPLEX_H +#define ROCFFT_COMPLEX_H + +#include +#if !defined(__HIPCC_RTC__) +#include +#endif +#include +#include + +#ifdef __HIP_PLATFORM_NVIDIA__ +typedef __half _Float16; +#endif + +template +struct rocfft_complex +{ + + Treal x; // Real part + Treal y; // Imaginary part + + // Constructors + // Do not initialize the members x or y by default, to ensure that it can + // be used in __shared__ and that it is a trivial class compatible with C. 
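+    // For instance (illustrative check), the following is expected to hold
+    // because the special member functions below are all defaulted:
+    //   static_assert(std::is_trivially_copyable<rocfft_complex<float>>::value, "");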
+ __device__ __host__ rocfft_complex() = default; + __device__ __host__ rocfft_complex(const rocfft_complex&) = default; + __device__ __host__ rocfft_complex(rocfft_complex&&) = default; + __device__ __host__ rocfft_complex& operator=(const rocfft_complex& rhs) & = default; + __device__ __host__ rocfft_complex& operator=(rocfft_complex&& rhs) & = default; + __device__ __host__ ~rocfft_complex() = default; + + // Constructor from real and imaginary parts + __device__ __host__ constexpr rocfft_complex(Treal real, Treal imag) + : x{real} + , y{imag} + { + } + + // Conversion from different precision + template + __device__ __host__ explicit constexpr rocfft_complex(const rocfft_complex& z) + : x(z.x) + , y(z.y) + { + } + + // Accessors + __device__ __host__ constexpr Treal real() const + { + return x; + } + + __device__ __host__ constexpr Treal imag() const + { + return y; + } + + // Unary operations + __forceinline__ __device__ __host__ rocfft_complex operator-() const + { + return {-x, -y}; + } + + __forceinline__ __device__ __host__ rocfft_complex operator+() const + { + return *this; + } + + __device__ __host__ Treal asum(const rocfft_complex& z) + { + return abs(z.x) + abs(z.y); + } + + // Internal real functions + static __forceinline__ __device__ __host__ Treal abs(Treal x) + { + return x < 0 ? -x : x; + } + + static __forceinline__ __device__ __host__ float sqrt(float x) + { + return ::sqrtf(x); + } + + static __forceinline__ __device__ __host__ double sqrt(double x) + { + return ::sqrt(x); + } + + // Addition operators + __device__ __host__ auto& operator+=(const rocfft_complex& rhs) + { + return *this = {x + rhs.x, y + rhs.y}; + } + + __device__ __host__ auto operator+(const rocfft_complex& rhs) const + { + auto lhs = *this; + return lhs += rhs; + } + + // Subtraction operators + __device__ __host__ auto& operator-=(const rocfft_complex& rhs) + { + return *this = {x - rhs.x, y - rhs.y}; + } + + __device__ __host__ auto operator-(const rocfft_complex& rhs) const + { + auto lhs = *this; + return lhs -= rhs; + } + + // Multiplication operators + __device__ __host__ auto& operator*=(const rocfft_complex& rhs) + { + return *this = {x * rhs.x - y * rhs.y, y * rhs.x + x * rhs.y}; + } + + __device__ __host__ auto operator*(const rocfft_complex& rhs) const + { + auto lhs = *this; + return lhs *= rhs; + } + + // Division operators + __device__ __host__ auto& operator/=(const rocfft_complex& rhs) + { + // Form of Robert L. 
Smith's Algorithm 116 + if(abs(rhs.x) > abs(rhs.y)) + { + Treal ratio = rhs.y / rhs.x; + Treal scale = 1 / (rhs.x + rhs.y * ratio); + *this = {(x + y * ratio) * scale, (y - x * ratio) * scale}; + } + else + { + Treal ratio = rhs.x / rhs.y; + Treal scale = 1 / (rhs.x * ratio + rhs.y); + *this = {(y + x * ratio) * scale, (y * ratio - x) * scale}; + } + return *this; + } + + __device__ __host__ auto operator/(const rocfft_complex& rhs) const + { + auto lhs = *this; + return lhs /= rhs; + } + + // Comparison operators + __device__ __host__ constexpr bool operator==(const rocfft_complex& rhs) const + { + return x == rhs.x && y == rhs.y; + } + + __device__ __host__ constexpr bool operator!=(const rocfft_complex& rhs) const + { + return !(*this == rhs); + } + + // Operators for complex-real computations + template + __device__ __host__ auto& operator+=(const U& rhs) + { + return (x += Treal(rhs)), *this; + } + + template + __device__ __host__ auto& operator-=(const U& rhs) + { + return (x -= Treal(rhs)), *this; + } + + __device__ __host__ auto operator+(const Treal& rhs) + { + auto lhs = *this; + return lhs += rhs; + } + + __device__ __host__ auto operator-(const Treal& rhs) + { + auto lhs = *this; + return lhs -= rhs; + } + + template + __device__ __host__ auto& operator*=(const U& rhs) + { + return (x *= Treal(rhs)), (y *= Treal(rhs)), *this; + } + + template + __device__ __host__ auto operator*(const U& rhs) const + { + auto lhs = *this; + return lhs *= Treal(rhs); + } + + template + __device__ __host__ auto& operator/=(const U& rhs) + { + return (x /= Treal(rhs)), (y /= Treal(rhs)), *this; + } + + template + __device__ __host__ auto operator/(const U& rhs) const + { + auto lhs = *this; + return lhs /= Treal(rhs); + } + + template + __device__ __host__ constexpr bool operator==(const U& rhs) const + { + return x == Treal(rhs) && y == 0; + } + + template + __device__ __host__ constexpr bool operator!=(const U& rhs) const + { + return !(*this == rhs); + } +}; + +// Stream operators +#if !defined(__HIPCC_RTC__) +static std::ostream& operator<<(std::ostream& stream, const _Float16& f) +{ + return stream << static_cast(f); +} + +template +std::ostream& operator<<(std::ostream& out, const rocfft_complex& z) +{ + return out << '(' << static_cast(z.x) << ',' << static_cast(z.y) << ')'; +} +#endif + +// Operators for real-complex computations +template +__device__ __host__ rocfft_complex operator+(const U& lhs, const rocfft_complex& rhs) +{ + return {Treal(lhs) + rhs.x, rhs.y}; +} + +template +__device__ __host__ rocfft_complex operator-(const U& lhs, const rocfft_complex& rhs) +{ + return {Treal(lhs) - rhs.x, -rhs.y}; +} + +template +__device__ __host__ rocfft_complex operator*(const U& lhs, const rocfft_complex& rhs) +{ + return {Treal(lhs) * rhs.x, Treal(lhs) * rhs.y}; +} + +template +__device__ __host__ rocfft_complex operator/(const U& lhs, const rocfft_complex& rhs) +{ + // Form of Robert L. 
Smith's Algorithm 116 + if(rocfft_complex::abs(rhs.x) > rocfft_complex::abs(rhs.y)) + { + Treal ratio = rhs.y / rhs.x; + Treal scale = Treal(lhs) / (rhs.x + rhs.y * ratio); + return {scale, -scale * ratio}; + } + else + { + Treal ratio = rhs.x / rhs.y; + Treal scale = Treal(lhs) / (rhs.x * ratio + rhs.y); + return {ratio * scale, -scale}; + } +} + +template +__device__ __host__ constexpr bool operator==(const U& lhs, const rocfft_complex& rhs) +{ + return Treal(lhs) == rhs.x && 0 == rhs.y; +} + +template +__device__ __host__ constexpr bool operator!=(const U& lhs, const rocfft_complex& rhs) +{ + return !(lhs == rhs); +} + +// Extending std namespace to handle rocfft_complex datatype +namespace std +{ + template + __device__ __host__ constexpr Treal real(const rocfft_complex& z) + { + return z.x; + } + + template + __device__ __host__ constexpr Treal imag(const rocfft_complex& z) + { + return z.y; + } + + template + __device__ __host__ constexpr rocfft_complex conj(const rocfft_complex& z) + { + return {z.x, -z.y}; + } + + template + __device__ __host__ inline Treal norm(const rocfft_complex& z) + { + return (z.x * z.x) + (z.y * z.y); + } + + template + __device__ __host__ inline Treal abs(const rocfft_complex& z) + { + Treal tr = rocfft_complex::abs(z.x), ti = rocfft_complex::abs(z.y); + return tr > ti ? (ti /= tr, tr * rocfft_complex::sqrt(ti * ti + 1)) + : ti ? (tr /= ti, ti * rocfft_complex::sqrt(tr * tr + 1)) + : 0; + } +} + +#endif // ROCFFT_COMPLEX_H diff --git a/shared/rocfft_hip.h b/shared/rocfft_hip.h new file mode 100644 index 0000000..e086cab --- /dev/null +++ b/shared/rocfft_hip.h @@ -0,0 +1,52 @@ +// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +#ifndef __ROCFFT_HIP_H__ +#define __ROCFFT_HIP_H__ + +#include +#include + +class rocfft_scoped_device +{ +public: + rocfft_scoped_device(int device) + { + if(hipGetDevice(&orig_device) != hipSuccess) + throw std::runtime_error("hipGetDevice failure"); + + if(hipSetDevice(device) != hipSuccess) + throw std::runtime_error("hipSetDevice failure"); + } + ~rocfft_scoped_device() + { + (void)hipSetDevice(orig_device); + } + + // not copyable or movable + rocfft_scoped_device(const rocfft_scoped_device&) = delete; + rocfft_scoped_device(rocfft_scoped_device&&) = delete; + rocfft_scoped_device& operator=(const rocfft_scoped_device&) = delete; + +private: + int orig_device; +}; + +#endif // __ROCFFT_HIP_H__ diff --git a/shared/rocfft_params.h b/shared/rocfft_params.h new file mode 100644 index 0000000..bf9b728 --- /dev/null +++ b/shared/rocfft_params.h @@ -0,0 +1,585 @@ +// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +#ifndef ROCFFT_PARAMS_H +#define ROCFFT_PARAMS_H + +#include "../shared/fft_params.h" +#include "../shared/gpubuf.h" +#include "rocfft/rocfft.h" + +// Return the string of the rocfft_status code +static std::string rocfft_status_to_string(const rocfft_status ret) +{ + switch(ret) + { + case rocfft_status_success: + return "rocfft_status_success"; + case rocfft_status_failure: + return "rocfft_status_failure"; + case rocfft_status_invalid_arg_value: + return "rocfft_status_invalid_arg_value"; + case rocfft_status_invalid_dimensions: + return "rocfft_status_invalid_dimensions"; + case rocfft_status_invalid_array_type: + return "rocfft_status_invalid_array_type"; + case rocfft_status_invalid_strides: + return "rocfft_status_invalid_strides"; + case rocfft_status_invalid_distance: + return "rocfft_status_invalid_distance"; + case rocfft_status_invalid_offset: + return "rocfft_status_invalid_offset"; + case rocfft_status_invalid_work_buffer: + return "rocfft_status_invalid_work_buffer"; + default: + throw std::runtime_error("unknown rocfft_status"); + } +} + +inline fft_status fft_status_from_rocfftparams(const rocfft_status val) +{ + switch(val) + { + case rocfft_status_success: + return fft_status_success; + case rocfft_status_failure: + return fft_status_failure; + case rocfft_status_invalid_arg_value: + return fft_status_invalid_arg_value; + case rocfft_status_invalid_dimensions: + return fft_status_invalid_dimensions; + case rocfft_status_invalid_array_type: + return fft_status_invalid_array_type; + case rocfft_status_invalid_strides: + return fft_status_invalid_strides; + case rocfft_status_invalid_distance: + return fft_status_invalid_distance; + case rocfft_status_invalid_offset: + return fft_status_invalid_offset; + case rocfft_status_invalid_work_buffer: + return fft_status_invalid_work_buffer; + default: + throw std::runtime_error("Invalid status"); + } +} + +inline rocfft_precision rocfft_precision_from_fftparams(const fft_precision val) +{ + switch(val) + { + case fft_precision_single: + return rocfft_precision_single; + case fft_precision_double: + return rocfft_precision_double; + case fft_precision_half: + return rocfft_precision_half; + default: + throw std::runtime_error("Invalid precision"); + } +} + +inline rocfft_array_type rocfft_array_type_from_fftparams(const fft_array_type val) +{ + switch(val) + { + case fft_array_type_complex_interleaved: + return rocfft_array_type_complex_interleaved; + case fft_array_type_complex_planar: + return rocfft_array_type_complex_planar; + case fft_array_type_real: + return rocfft_array_type_real; + case fft_array_type_hermitian_interleaved: + return rocfft_array_type_hermitian_interleaved; + case fft_array_type_hermitian_planar: + return rocfft_array_type_hermitian_planar; + case fft_array_type_unset: + return rocfft_array_type_unset; + } + return rocfft_array_type_unset; +} + +inline rocfft_transform_type rocfft_transform_type_from_fftparams(const fft_transform_type val) +{ + switch(val) + { + case fft_transform_type_complex_forward: + return rocfft_transform_type_complex_forward; + case fft_transform_type_complex_inverse: + return rocfft_transform_type_complex_inverse; + case fft_transform_type_real_forward: + return rocfft_transform_type_real_forward; + case fft_transform_type_real_inverse: + return rocfft_transform_type_real_inverse; + default: + throw std::runtime_error("Invalid transform type"); + } +} + +inline rocfft_result_placement + rocfft_result_placement_from_fftparams(const fft_result_placement val) +{ + switch(val) + { + 
+    case fft_placement_inplace:
+        return rocfft_placement_inplace;
+    case fft_placement_notinplace:
+        return rocfft_placement_notinplace;
+    default:
+        throw std::runtime_error("Invalid result placement");
+    }
+}
+
+class rocfft_params : public fft_params
+{
+public:
+    rocfft_plan plan = nullptr;
+    rocfft_execution_info info = nullptr;
+    rocfft_plan_description desc = nullptr;
+    gpubuf_t<char> wbuffer;
+
+    explicit rocfft_params(){};
+
+    explicit rocfft_params(const fft_params& p)
+        : fft_params(p){};
+
+    rocfft_params(const rocfft_params&) = delete;
+    rocfft_params& operator=(const rocfft_params&) = delete;
+
+    ~rocfft_params()
+    {
+        free();
+    };
+
+    void free()
+    {
+        if(plan != nullptr)
+        {
+            rocfft_plan_destroy(plan);
+            plan = nullptr;
+        }
+        if(info != nullptr)
+        {
+            rocfft_execution_info_destroy(info);
+            info = nullptr;
+        }
+        if(desc != nullptr)
+        {
+            rocfft_plan_description_destroy(desc);
+            desc = nullptr;
+        }
+        wbuffer.free();
+    }
+
+    void validate_fields() const override
+    {
+        // row-major lengths including batch (i.e. batch is at the front)
+        std::vector<size_t> length_with_batch{nbatch};
+        std::copy(length.begin(), length.end(), std::back_inserter(length_with_batch));
+
+        auto validate_field = [&](const fft_field& f) {
+            for(const auto& b : f.bricks)
+            {
+                // bricks must have same dim as FFT, including batch
+                if(b.lower.size() != length.size() + 1 || b.upper.size() != length.size() + 1
+                   || b.stride.size() != length.size() + 1)
+                    throw std::runtime_error(
+                        "brick dimension does not match FFT + batch dimension");
+
+                // ensure lower < upper, and that both fit in the FFT + batch dims
+                if(!std::lexicographical_compare(
+                       b.lower.begin(), b.lower.end(), b.upper.begin(), b.upper.end()))
+                    throw std::runtime_error("brick lower index is not less than upper index");
+
+                if(!std::lexicographical_compare(b.lower.begin(),
+                                                 b.lower.end(),
+                                                 length_with_batch.begin(),
+                                                 length_with_batch.end()))
+                    throw std::runtime_error(
+                        "brick lower index is not less than FFT + batch length");
+
+                if(!std::lexicographical_compare(b.upper.begin(),
+                                                 b.upper.end(),
+                                                 length_with_batch.begin(),
+                                                 length_with_batch.end())
+                   && b.upper != length_with_batch)
+                    throw std::runtime_error("brick upper index is not <= FFT + batch length");
+            }
+        };
+
+        for(const auto& ifield : ifields)
+            validate_field(ifield);
+        for(const auto& ofield : ofields)
+            validate_field(ofield);
+    }
+
+    rocfft_precision get_rocfft_precision()
+    {
+        return rocfft_precision_from_fftparams(precision);
+    }
+
+    size_t vram_footprint() override
+    {
+        size_t val = fft_params::vram_footprint();
+        if(setup_structs() != fft_status_success)
+        {
+            throw std::runtime_error("Struct setup failed");
+        }
+        val += workbuffersize;
+
+        return val;
+    }
+
+    // Convert the generic fft_field structure to a rocfft_field
+    // structure that can be passed to rocFFT. In particular, we need
+    // to convert from row-major to column-major.
+    static rocfft_field fft_field_to_rocfft_field(const fft_field& f)
+    {
+        rocfft_field rfield = nullptr;
+        if(f.bricks.empty())
+            return rfield;
+
+        if(rocfft_field_create(&rfield) != rocfft_status_success)
+            throw std::runtime_error("rocfft_field_create failed");
+        for(const auto& b : f.bricks)
+        {
+            // rocFFT wants column-major bricks and fft_params stores
+            // row-major
+            std::vector<size_t> lower_cm;
+            std::copy(b.lower.rbegin(), b.lower.rend(), std::back_inserter(lower_cm));
+            std::vector<size_t> upper_cm;
+            std::copy(b.upper.rbegin(), b.upper.rend(), std::back_inserter(upper_cm));
+            std::vector<size_t> stride_cm;
+            std::copy(b.stride.rbegin(), b.stride.rend(), std::back_inserter(stride_cm));
+
+            rocfft_brick rbrick = nullptr;
+            if(rocfft_brick_create(&rbrick,
+                                   lower_cm.data(), // field_lower
+                                   upper_cm.data(), // field_upper
+                                   stride_cm.data(), // brick_stride
+                                   lower_cm.size(), // dim
+                                   b.device) // deviceID
+               != rocfft_status_success)
+                throw std::runtime_error("rocfft_brick_create failed");
+
+            if(rocfft_field_add_brick(rfield, rbrick) != rocfft_status_success)
+                throw std::runtime_error("rocfft_field_add_brick failed");
+
+            rocfft_brick_destroy(rbrick);
+        }
+        return rfield;
+    }
+
+    fft_status setup_structs()
+    {
+        rocfft_status fft_status = rocfft_status_success;
+        if(desc == nullptr)
+        {
+            fft_status = rocfft_plan_description_create(&desc);
+            if(fft_status != rocfft_status_success)
+                return fft_status_from_rocfftparams(fft_status);
+
+            fft_status
+                = rocfft_plan_description_set_data_layout(desc,
+                                                          rocfft_array_type_from_fftparams(itype),
+                                                          rocfft_array_type_from_fftparams(otype),
+                                                          ioffset.data(),
+                                                          ooffset.data(),
+                                                          istride_cm().size(),
+                                                          istride_cm().data(),
+                                                          idist,
+                                                          ostride_cm().size(),
+                                                          ostride_cm().data(),
+                                                          odist);
+            if(fft_status != rocfft_status_success)
+            {
+                throw std::runtime_error("rocfft_plan_description_set_data_layout failed");
+            }
+
+            if(scale_factor != 1.0)
+            {
+                fft_status = rocfft_plan_description_set_scale_factor(desc, scale_factor);
+                if(fft_status != rocfft_status_success)
+                {
+                    throw std::runtime_error("rocfft_plan_description_set_scale_factor failed");
+                }
+            }
+
+            for(const auto& ifield : ifields)
+            {
+                rocfft_field infield = fft_field_to_rocfft_field(ifield);
+                if(rocfft_plan_description_add_infield(desc, infield) != rocfft_status_success)
+                    throw std::runtime_error("rocfft_description_add_infield failed");
+                rocfft_field_destroy(infield);
+            }
+
+            for(const auto& ofield : ofields)
+            {
+                rocfft_field outfield = fft_field_to_rocfft_field(ofield);
+                if(rocfft_plan_description_add_outfield(desc, outfield) != rocfft_status_success)
+                    throw std::runtime_error("rocfft_description_add_outfield failed");
+                rocfft_field_destroy(outfield);
+            }
+        }
+
+        if(plan == nullptr)
+        {
+            fft_status = rocfft_plan_create(&plan,
+                                            rocfft_result_placement_from_fftparams(placement),
+                                            rocfft_transform_type_from_fftparams(transform_type),
+                                            get_rocfft_precision(),
+                                            length_cm().size(),
+                                            length_cm().data(),
+                                            nbatch,
+                                            desc);
+            if(fft_status != rocfft_status_success)
+            {
+                throw std::runtime_error("rocfft_plan_create failed");
+            }
+        }
+
+        if(info == nullptr)
+        {
+            fft_status = rocfft_execution_info_create(&info);
+            if(fft_status != rocfft_status_success)
+            {
+                throw std::runtime_error("rocfft_execution_info_create failed");
+            }
+        }
+
+        fft_status = rocfft_plan_get_work_buffer_size(plan, &workbuffersize);
+        if(fft_status != rocfft_status_success)
+        {
+            throw std::runtime_error("rocfft_plan_get_work_buffer_size failed");
+        }
+
+        return fft_status_from_rocfftparams(fft_status);
+    }
+
+    fft_status create_plan() override
+    {
+        fft_status ret = setup_structs();
+        if(ret != fft_status_success)
+        {
+            return ret;
+        }
+        if(workbuffersize > 0)
+        {
+            hipError_t hip_status = hipSuccess;
+            hip_status = wbuffer.alloc(workbuffersize);
+            if(hip_status != hipSuccess)
+            {
+                std::ostringstream oss;
+                oss << "work buffer allocation failed (" << workbuffersize << " requested)";
+                size_t mem_free = 0;
+                size_t mem_total = 0;
+                hip_status = hipMemGetInfo(&mem_free, &mem_total);
+                if(hip_status == hipSuccess)
+                {
+                    oss << "free vram: " << mem_free << " total vram: " << mem_total;
+                }
+                else
+                {
+                    oss << "hipMemGetInfo also failed";
+                }
+                throw work_buffer_alloc_failure(oss.str());
+            }
+
+            auto rocret
+                = rocfft_execution_info_set_work_buffer(info, wbuffer.data(), workbuffersize);
+            if(rocret != rocfft_status_success)
+            {
+                throw std::runtime_error("rocfft_execution_info_set_work_buffer failed");
+            }
+        }
+
+        return ret;
+    }
+
+    fft_status set_callbacks(void* load_cb_host,
+                             void* load_cb_data,
+                             void* store_cb_host,
+                             void* store_cb_data) override
+    {
+        if(run_callbacks)
+        {
+            auto roc_status
+                = rocfft_execution_info_set_load_callback(info, &load_cb_host, &load_cb_data, 0);
+            if(roc_status != rocfft_status_success)
+                return fft_status_from_rocfftparams(roc_status);
+
+            roc_status
+                = rocfft_execution_info_set_store_callback(info, &store_cb_host, &store_cb_data, 0);
+            if(roc_status != rocfft_status_success)
+                return fft_status_from_rocfftparams(roc_status);
+        }
+        return fft_status_success;
+    }
+
+    fft_status execute(void** in, void** out) override
+    {
+        auto ret = rocfft_execute(plan, in, out, info);
+        return fft_status_from_rocfftparams(ret);
+    }
+
+    // scatter data to multiple GPUs and adjust I/O buffers to match
+    void multi_gpu_prepare(std::vector<gpubuf>& ibuffer,
+                           std::vector<void*>& pibuffer,
+                           std::vector<void*>& pobuffer) override
+    {
+        auto alloc_fields = [&](const fft_params::fft_field& field,
+                                fft_array_type array_type,
+                                std::vector<void*>& pbuffer,
+                                bool copy_input) {
+            if(field.bricks.empty())
+                return;
+
+            // we have a field defined, clear the list of buffers as
+            // we'll be allocating new ones for each brick
+            pbuffer.clear();
+
+            for(const auto& b : field.bricks)
+            {
+                // get brick's length - note that this includes batch
+                // dimension
+                const auto brick_len = b.length();
+                const auto brick_stride = b.stride;
+
+                const size_t brick_size_elems = product(brick_len.begin(), brick_len.end());
+                const size_t elem_size_bytes = var_size(precision, array_type);
+                const size_t brick_size_bytes = brick_size_elems * elem_size_bytes;
+
+                // set device for the alloc, but we want to return to the
+                // default device as the source of a following memcpy
+                {
+                    rocfft_scoped_device dev(b.device);
+                    multi_gpu_data.emplace_back();
+                    if(multi_gpu_data.back().alloc(brick_size_bytes) != hipSuccess)
+                        throw std::runtime_error("device allocation failure");
+                    pbuffer.push_back(multi_gpu_data.back().data());
+                }
+
+                if(copy_input)
+                {
+                    // For now, assume we're only splitting on highest FFT
+                    // dimension, lower-dimensional FFT data is all
+                    // contiguous, and batches are contiguous in each brick.
+                    //
+                    // That means we can express this as a 2D memcpy.
+                    const size_t unbatched_elems_per_brick
+                        = product(brick_len.begin() + 1, brick_len.end());
+                    const size_t unbatched_elems_per_fft = product(length.begin(), length.end());
+
+                    // get this brick's starting offset in the field
+                    const size_t brick_offset
+                        = b.lower_field_offset(istride, idist) * elem_size_bytes;
+
+                    // copy from original input - note that we're
+                    // assuming interleaved data so ibuffer has only one
+                    // gpubuf
+                    if(hipMemcpy2D(pbuffer.back(),
+                                   unbatched_elems_per_brick * elem_size_bytes,
+                                   ibuffer.front().data_offset(brick_offset),
+                                   unbatched_elems_per_fft * elem_size_bytes,
+                                   unbatched_elems_per_brick * elem_size_bytes,
+                                   brick_len.front(),
+                                   hipMemcpyHostToDevice)
+                       != hipSuccess)
+                        throw std::runtime_error("hipMemcpy failure");
+                }
+            }
+
+            // if we copied the input to all the other devices, and
+            // this is an out-of-place transform, we no longer
+            // need the original input
+            if(copy_input && placement == fft_placement_notinplace)
+                ibuffer.clear();
+        };
+
+        // assume one input, one output field for simple cases
+        if(!ifields.empty())
+            alloc_fields(ifields.front(), itype, pibuffer, true);
+        if(!ofields.empty())
+        {
+            if(!ifields.empty() && placement == fft_placement_inplace)
+                pobuffer = pibuffer;
+            else
+                alloc_fields(ofields.front(), otype, pobuffer, false);
+        }
+    }
+
+    // when preparing for multi-GPU transform, we need to allocate data
+    // on each GPU. This vector remembers all of those allocations.
+    std::vector<gpubuf> multi_gpu_data;
+
+    // gather data after multi-GPU FFT for verification
+    void multi_gpu_finalize(std::vector<gpubuf>& obuffer, std::vector<void*>& pobuffer) override
+    {
+        if(ofields.empty())
+            return;
+
+        for(size_t i = 0; i < ofields.front().bricks.size(); ++i)
+        {
+            const auto& b = ofields.front().bricks[i];
+            const auto& brick_ptr = pobuffer[i];
+
+            const auto brick_len = b.length();
+
+            const size_t elem_size_bytes = var_size(precision, otype);
+
+            // get this brick's starting offset in the field
+            const size_t brick_offset = b.lower_field_offset(ostride, odist) * elem_size_bytes;
+
+            // switch device to where we're copying from
+            rocfft_scoped_device dev(b.device);
+
+            // For now, assume we're only splitting on highest FFT
+            // dimension, lower-dimensional FFT data is all
+            // contiguous, and batches are contiguous in each brick.
+            //
+            // That means we can express this as a 2D memcpy.
+            const size_t unbatched_elems_per_brick
+                = product(brick_len.begin() + 1, brick_len.end());
+            const auto output_length = olength();
+            const size_t unbatched_elems_per_fft
+                = product(output_length.begin(), output_length.end());
+
+            // copy to original output buffer - note that
+            // we're assuming interleaved data so obuffer
+            // has only one gpubuf
+            if(hipMemcpy2D(obuffer.front().data_offset(brick_offset),
+                           unbatched_elems_per_fft * elem_size_bytes,
+                           brick_ptr,
+                           unbatched_elems_per_brick * elem_size_bytes,
+                           unbatched_elems_per_brick * elem_size_bytes,
+                           brick_len.front(),
+                           hipMemcpyDeviceToDevice)
+               != hipSuccess)
+                throw std::runtime_error("hipMemcpy failure");
+
+            // device-to-device transfers don't synchronize with the
+            // host, add explicit sync
+            (void)hipDeviceSynchronize();
+        }
+        pobuffer.clear();
+        pobuffer.push_back(obuffer.front().data());
+    }
+};
+
+#endif
diff --git a/shared/test_params.h b/shared/test_params.h
new file mode 100644
index 0000000..8d8f6f7
--- /dev/null
+++ b/shared/test_params.h
@@ -0,0 +1,51 @@
+// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#pragma once
+#ifndef TESTCONSTANTS_H
+#define TESTCONSTANTS_H
+
+#include <cstddef>
+
+extern int verbose;
+extern size_t ramgb;
+extern size_t vramgb;
+
+extern size_t n_random_tests;
+
+extern size_t random_seed;
+extern double planar_prob;
+extern double callback_prob;
+
+extern double half_epsilon;
+extern double single_epsilon;
+extern double double_epsilon;
+extern bool skip_runtime_fails;
+
+extern double max_linf_eps_double;
+extern double max_l2_eps_double;
+extern double max_linf_eps_single;
+extern double max_l2_eps_single;
+extern double max_linf_eps_half;
+extern double max_l2_eps_half;
+
+extern int n_hip_failures;
+
+#endif
diff --git a/shared/work_queue.h b/shared/work_queue.h
new file mode 100644
index 0000000..e13fc41
--- /dev/null
+++ b/shared/work_queue.h
@@ -0,0 +1,49 @@
+// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#pragma once
+
+#include <condition_variable>
+#include <mutex>
+#include <queue>
+template <class _WorkItem>
+struct WorkQueue
+{
+    void push(_WorkItem&& i)
+    {
+        std::unique_lock<std::mutex> lock(queueMutex);
+        items.emplace(std::move(i));
+        emptyWait.notify_all();
+    }
+    _WorkItem pop()
+    {
+        std::unique_lock<std::mutex> lock(queueMutex);
+        while(items.empty())
+            emptyWait.wait(lock);
+        _WorkItem item(items.front());
+        items.pop();
+        return item;
+    }
+
+private:
+    std::queue<_WorkItem> items;
+    std::mutex queueMutex;
+    std::condition_variable emptyWait;
+};
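
For reference, a minimal sketch (not part of the patch) of how the WorkQueue template added in shared/work_queue.h might be exercised; the int work items, the sentinel value, and the single consumer thread here are illustrative assumptions, not taken from the hipFFT clients.

    // build with a C++11 (or later) compiler, e.g. hipcc or g++
    #include <cstdio>
    #include <thread>

    #include "work_queue.h"

    int main()
    {
        WorkQueue<int> queue;

        // one consumer thread pops items until it sees a negative sentinel
        std::thread consumer([&queue] {
            for(;;)
            {
                int item = queue.pop(); // blocks until an item is available
                if(item < 0)
                    break;
                std::printf("got %d\n", item);
            }
        });

        // producer pushes a few work items, then the sentinel
        for(int i = 0; i < 4; ++i)
            queue.push(int(i));
        queue.push(-1);

        consumer.join();
        return 0;
    }

push() locks the mutex, enqueues the item, and notifies; pop() waits on the condition variable while the queue is empty, so the consumer never busy-waits.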