diff --git a/.gitmodules b/.gitmodules
deleted file mode 100644
index 3964809..0000000
--- a/.gitmodules
+++ /dev/null
@@ -1,3 +0,0 @@
-[submodule "rocFFT"]
-	path = clients/rocFFT
-	url = https://github.com/ROCmSoftwarePlatform/rocFFT.git
diff --git a/CHANGELOG.md b/CHANGELOG.md
index e0a2f42..cc23af9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,10 @@ Documentation for hipFFT is available at
 
 ## hipFFT 1.0.14 for ROCm 6.1.0
 
+### Changes
+
+* When building hipFFT from source, rocFFT code no longer needs to be initialized as a git submodule.
+
 ### Fixes
 
 * Fixed error when creating length-1 plans.
diff --git a/README.md b/README.md
index a852205..135b4f3 100644
--- a/README.md
+++ b/README.md
@@ -61,8 +61,6 @@ To build hipFFT from source, follow these steps:
 
    * The clients (samples, tests, etc) included with the hipFFT source depend on FFTW, GoogleTest, and boost program options.
 
-   * The bench and test clients also require the rocFFT source tree (`git submodule update --init`).
-
 3. Build hipFFT:
 
    To show all build options:
diff --git a/clients/CMakeLists.txt b/clients/CMakeLists.txt
index 1db0d9c..b99a9e5 100644
--- a/clients/CMakeLists.txt
+++ b/clients/CMakeLists.txt
@@ -65,21 +65,6 @@ if( NOT CMAKE_CXX_COMPILER MATCHES ".*/hipcc$" AND NOT CMAKE_CXX_COMPILER_ID STR
 endif()
 
-if( GIT_FOUND AND EXISTS "${CMAKE_SOURCE_DIR}/.git" )
-  message(STATUS "rocFFT submodule update")
-  execute_process(COMMAND ${GIT_EXECUTABLE} submodule update --init --recursive
-                  WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/clients/rocFFT
-                  RESULT_VARIABLE GIT_SUBMOD_RESULT)
-  if( NOT GIT_SUBMOD_RESULT EQUAL "0" )
-    message(FATAL_ERROR "git submodule update --init --recursive failed with ${GIT_SUBMOD_RESULT}, please checkout submodules manually.")
-  endif( )
-endif( )
-
-if( NOT EXISTS "${CMAKE_SOURCE_DIR}/clients/rocFFT/CMakeLists.txt" )
-  message(FATAL_ERROR "The rocFFT submodule is not present! Please update git submodules and try again. ${CMAKE_CURRENT_SOURCE_DIR}/clients/rocFFT/CMakeLists.txt")
-endif( )
-
-
 # This option only works for make/nmake and the ninja generators, but no reason it shouldn't be on
 # all the time
 # This tells cmake to create a compile_commands.json file that can be used with clang tooling or vim
diff --git a/clients/bench/CMakeLists.txt b/clients/bench/CMakeLists.txt
index b5cef9b..ccb8c29 100644
--- a/clients/bench/CMakeLists.txt
+++ b/clients/bench/CMakeLists.txt
@@ -26,8 +26,8 @@ find_package( Boost COMPONENTS program_options REQUIRED)
 set( Boost_USE_STATIC_LIBS OFF )
 
-set( hipfft_bench_source bench.cpp ../rocFFT/shared/array_validator.cpp )
-set( hipfft_bench_includes bench.h ../rocFFT/shared/array_validator.h )
+set( hipfft_bench_source bench.cpp ../../shared/array_validator.cpp )
+set( hipfft_bench_includes bench.h ../../shared/array_validator.h )
 
 add_executable( hipfft-bench ${hipfft_bench_source} ${hipfft_bench_includes} )
diff --git a/clients/bench/bench.cpp b/clients/bench/bench.cpp
index ca60896..d2778de 100644
--- a/clients/bench/bench.cpp
+++ b/clients/bench/bench.cpp
@@ -29,7 +29,7 @@
 #include
 namespace po = boost::program_options;
 
-#include "../rocFFT/shared/gpubuf.h"
+#include "../../shared/gpubuf.h"
 
 int main(int argc, char* argv[])
 {
diff --git a/clients/hipfft_params.h b/clients/hipfft_params.h
index b8b58ac..75d9db9 100644
--- a/clients/hipfft_params.h
+++ b/clients/hipfft_params.h
@@ -23,9 +23,9 @@
 
 #include
 
+#include "../shared/fft_params.h"
 #include "hipfft/hipfft.h"
 #include "hipfft/hipfftXt.h"
-#include "rocFFT/shared/fft_params.h"
 
 inline fft_status fft_status_from_hipfftparams(const hipfftResult_t val)
 {
diff --git a/clients/rocFFT b/clients/rocFFT
deleted file mode 160000
index d1c9113..0000000
--- a/clients/rocFFT
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit d1c91135da99acd2c690e9aae619642ab57b0914
diff --git a/clients/tests/CMakeLists.txt b/clients/tests/CMakeLists.txt
index 9742a45..2d1aac0 100644
--- a/clients/tests/CMakeLists.txt
+++ b/clients/tests/CMakeLists.txt
@@ -37,14 +37,7 @@ set( hipfft-test_source
   accuracy_test_3D.cpp
   accuracy_test_callback.cpp
   multi_device_test.cpp
-  ../rocFFT/shared/array_validator.cpp
-  )
-
-set( hipfft-test_includes
-  ../rocFFT/clients/tests/fftw_transform.h
-  ../rocFFT/clients/tests/rocfft_against_fftw.h
-  ../rocFFT/clients/tests/misc/include/test_exception.h
-  ../rocFFT/shared/array_validator.h
+  ../../shared/array_validator.cpp
   )
 
 add_executable( hipfft-test ${hipfft-test_source} ${hipfft-test_includes} )
@@ -56,8 +49,6 @@ target_include_directories(
   $
   $
   $
-  $
-  $
   )
diff --git a/clients/tests/accuracy_test_1D.cpp b/clients/tests/accuracy_test_1D.cpp
index 27e849d..57d846a 100644
--- a/clients/tests/accuracy_test_1D.cpp
+++ b/clients/tests/accuracy_test_1D.cpp
@@ -23,11 +23,11 @@
 
 #include
 #include
 
-#include "../rocFFT/shared/fft_params.h"
+#include "../../shared/fft_params.h"
 
-#include "accuracy_test.h"
-#include "fftw_transform.h"
-#include "rocfft_against_fftw.h"
+#include "../../shared/accuracy_test.h"
+#include "../../shared/fftw_transform.h"
+#include "../../shared/rocfft_against_fftw.h"
 
 using ::testing::ValuesIn;
diff --git a/clients/tests/accuracy_test_2D.cpp b/clients/tests/accuracy_test_2D.cpp
index 1674593..6f618c0 100644
--- a/clients/tests/accuracy_test_2D.cpp
+++ b/clients/tests/accuracy_test_2D.cpp
@@ -23,11 +23,11 @@
 
 #include
 #include
 
-#include "../rocFFT/shared/fft_params.h"
+#include "../../shared/fft_params.h"
 
-#include "accuracy_test.h"
-#include "fftw_transform.h"
-#include "rocfft_against_fftw.h"
+#include "../../shared/accuracy_test.h"
+#include "../../shared/fftw_transform.h"
+#include "../../shared/rocfft_against_fftw.h"
 
 using ::testing::ValuesIn;
diff --git a/clients/tests/accuracy_test_3D.cpp b/clients/tests/accuracy_test_3D.cpp
index a87476a..941ec24 100644
--- a/clients/tests/accuracy_test_3D.cpp
+++ b/clients/tests/accuracy_test_3D.cpp
@@ -23,11 +23,11 @@
 
 #include
 #include
 
-#include "../rocFFT/shared/fft_params.h"
+#include "../../shared/fft_params.h"
 
-#include "accuracy_test.h"
-#include "fftw_transform.h"
-#include "rocfft_against_fftw.h"
+#include "../../shared/accuracy_test.h"
+#include "../../shared/fftw_transform.h"
+#include "../../shared/rocfft_against_fftw.h"
 
 using ::testing::ValuesIn;
diff --git a/clients/tests/accuracy_test_callback.cpp b/clients/tests/accuracy_test_callback.cpp
index 4782830..b5cc4a7 100644
--- a/clients/tests/accuracy_test_callback.cpp
+++ b/clients/tests/accuracy_test_callback.cpp
@@ -18,7 +18,7 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.
 
-#include "accuracy_test.h"
+#include "../../shared/accuracy_test.h"
 
 std::vector> callback_sizes = {
     // some single kernel sizes
diff --git a/clients/tests/gtest_main.cpp b/clients/tests/gtest_main.cpp
index 1f0ae83..2f7674e 100644
--- a/clients/tests/gtest_main.cpp
+++ b/clients/tests/gtest_main.cpp
@@ -30,10 +30,10 @@
 #include
 #include
 
+#include "../../shared/concurrency.h"
+#include "../../shared/environment.h"
+#include "../../shared/work_queue.h"
 #include "../hipfft_params.h"
-#include "../rocFFT/shared/concurrency.h"
-#include "../rocFFT/shared/environment.h"
-#include "../rocFFT/shared/work_queue.h"
 #include "hipfft/hipfft.h"
 #include "hipfft_accuracy_test.h"
 #include "hipfft_test_params.h"
diff --git a/clients/tests/hipfft_accuracy_test.cpp b/clients/tests/hipfft_accuracy_test.cpp
index 2abaf74..609239a 100644
--- a/clients/tests/hipfft_accuracy_test.cpp
+++ b/clients/tests/hipfft_accuracy_test.cpp
@@ -29,11 +29,12 @@
 #include "hipfft/hipfft.h"
 
 #include "../hipfft_params.h"
-#include "../rocFFT/clients/tests/fftw_transform.h"
-#include "../rocFFT/clients/tests/rocfft_accuracy_test.h"
-#include "../rocFFT/clients/tests/rocfft_against_fftw.h"
-#include "../rocFFT/shared/gpubuf.h"
-#include "../rocFFT/shared/rocfft_complex.h"
+
+#include "../../shared/accuracy_test.h"
+#include "../../shared/fftw_transform.h"
+#include "../../shared/gpubuf.h"
+#include "../../shared/rocfft_against_fftw.h"
+#include "../../shared/rocfft_complex.h"
 
 void fft_vs_reference(hipfft_params& params, bool round_trip)
 {
diff --git a/clients/tests/hipfft_accuracy_test.h b/clients/tests/hipfft_accuracy_test.h
index 0491bd9..181150e 100644
--- a/clients/tests/hipfft_accuracy_test.h
+++ b/clients/tests/hipfft_accuracy_test.h
@@ -23,8 +23,8 @@
 #ifndef ROCFFT_ACCURACY_TEST
 #define ROCFFT_ACCURACY_TEST
 
+#include "../../shared/accuracy_test.h"
 #include "../hipfft_params.h"
-#include "../rocFFT/clients/tests/accuracy_test.h"
 
 void fft_vs_reference(hipfft_params& params, bool round_trip = false);
diff --git a/clients/tests/multi_device_test.cpp b/clients/tests/multi_device_test.cpp
index b3dc4c9..3274b80 100644
--- a/clients/tests/multi_device_test.cpp
+++ b/clients/tests/multi_device_test.cpp
@@ -18,7 +18,7 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.
 
-#include "accuracy_test.h"
+#include "../../shared/accuracy_test.h"
 
 #include
 #include
diff --git a/cmake/dependencies.cmake b/cmake/dependencies.cmake
index 5810e37..bdbf689 100644
--- a/cmake/dependencies.cmake
+++ b/cmake/dependencies.cmake
@@ -21,9 +21,6 @@
 # #
 # #############################################################################
 
-# Git
-find_package(Git REQUIRED)
-
 # HIP
 if( NOT CMAKE_CXX_COMPILER MATCHES ".*/hipcc$" )
   if( NOT BUILD_WITH_LIB STREQUAL "CUDA" )
diff --git a/library/src/amd_detail/hipfft.cpp b/library/src/amd_detail/hipfft.cpp
index 0278b88..b5e9079 100644
--- a/library/src/amd_detail/hipfft.cpp
+++ b/library/src/amd_detail/hipfft.cpp
@@ -27,10 +27,10 @@
 #include
 #include
 
-#include "../../../clients/rocFFT/shared/arithmetic.h"
-#include "../../../clients/rocFFT/shared/gpubuf.h"
-#include "../../../clients/rocFFT/shared/ptrdiff.h"
-#include "../../../clients/rocFFT/shared/rocfft_hip.h"
+#include "../../../shared/arithmetic.h"
+#include "../../../shared/gpubuf.h"
+#include "../../../shared/ptrdiff.h"
+#include "../../../shared/rocfft_hip.h"
 
 #define ROC_FFT_CHECK_ALLOC_FAILED(ret) \
 {                                       \
diff --git a/shared/accuracy_test.h b/shared/accuracy_test.h
new file mode 100644
index 0000000..362a7c1
--- /dev/null
+++ b/shared/accuracy_test.h
@@ -0,0 +1,1949 @@
+// Copyright (C) 2020 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#pragma once
+
+#ifndef ACCURACY_TEST
+#define ACCURACY_TEST
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "enum_to_string.h"
+#include "fft_params.h"
+#include "fftw_transform.h"
+#include "gpubuf.h"
+#include "rocfft_against_fftw.h"
+#include "test_params.h"
+
+extern int verbose;
+extern size_t ramgb;
+extern bool fftw_compare;
+
+static const size_t ONE_GiB = 1 << 30;
+
+inline size_t bytes_to_GiB(const size_t bytes)
+{
+    return bytes == 0 ? 0 : (bytes - 1 + ONE_GiB) / ONE_GiB;
+}
+
+typedef std::tuple
+    type_place_io_t;
+
+// Remember the results of the last FFT we computed with FFTW. Tests
+// are ordered so that later cases can often reuse this result.
+struct last_cpu_fft_cache +{ + // keys to the cache + std::vector length; + size_t nbatch = 0; + fft_transform_type transform_type = fft_transform_type_complex_forward; + bool run_callbacks = false; + fft_precision precision = fft_precision_single; + + // FFTW input/output + std::vector cpu_input; + std::vector cpu_output; +}; +extern last_cpu_fft_cache last_cpu_fft_data; + +struct system_memory +{ + size_t total_bytes = 0; + size_t free_bytes = 0; +}; +extern system_memory start_memory; + +system_memory get_system_memory(); + +// Estimate the amount of host memory needed for buffers. +inline size_t needed_ram_buffers(const fft_params& params, const int verbose) +{ + // This calculation is assuming contiguous data but noncontiguous buffers + // are assumed to require a close enough amount of space for the purposes + // of this estimate. + + size_t needed_ram = 6 + * std::accumulate(params.length.begin(), + params.length.end(), + static_cast(1), + std::multiplies()); + + // Account for precision and data type: + if(params.transform_type != fft_transform_type_real_forward + && params.transform_type != fft_transform_type_real_inverse) + { + needed_ram *= 2; + } + switch(params.precision) + { + case fft_precision_half: + needed_ram *= 2; + break; + case fft_precision_single: + needed_ram *= 4; + break; + case fft_precision_double: + needed_ram *= 8; + break; + } + + needed_ram *= params.nbatch; + + if(verbose) + { + std::cout << "required host memory for buffers (GiB): " << bytes_to_GiB(needed_ram) << "\n"; + } + + return needed_ram; +} + +template +bool fftw_plan_uses_bluestein(const typename fftw_trait::fftw_plan_type& cpu_plan) +{ +#ifdef FFTW_HAVE_SPRINT_PLAN + char* print_plan_c_str = fftw_sprint_plan(cpu_plan); + std::string print_plan(print_plan_c_str); + free(print_plan_c_str); + return print_plan.find("bluestein") != std::string::npos; +#else + // assume worst case (bluestein is always used) + return true; +#endif +} + +// Estimate the amount of host memory needed for fftw. 
+template +inline size_t needed_ram_fftw(const fft_params& contiguous_params, + const typename fftw_trait::fftw_plan_type& cpu_plan, + const int verbose) +{ + size_t total_length = std::accumulate(contiguous_params.length.begin(), + contiguous_params.length.end(), + static_cast(1), + std::multiplies()); + size_t needed_ram = 0; + // Detect Bluestein in plan + if(fftw_plan_uses_bluestein(cpu_plan)) + { + for(size_t dim : contiguous_params.length) + { + unsigned int needed_ram_dim = dim; + + // Next-plus-one-power-of-two multiplied any other lengths + needed_ram_dim--; + + needed_ram_dim |= needed_ram_dim >> 2; + needed_ram_dim |= needed_ram_dim >> 4; + needed_ram_dim |= needed_ram_dim >> 8; + needed_ram_dim |= needed_ram_dim >> 16; + + needed_ram_dim++; + + needed_ram_dim *= 2 * (total_length / dim); + + if(needed_ram_dim > needed_ram) + { + needed_ram = needed_ram_dim; + } + } + } + + // Account for precision and data type: + if(contiguous_params.transform_type != fft_transform_type_real_forward + && contiguous_params.transform_type != fft_transform_type_real_inverse) + { + needed_ram *= 2; + } + switch(contiguous_params.precision) + { + case fft_precision_half: + needed_ram *= 2; + break; + case fft_precision_single: + needed_ram *= 4; + break; + case fft_precision_double: + needed_ram *= 8; + break; + } + + needed_ram *= contiguous_params.nbatch; + + if(verbose) + { + std::cout << "required host memory for FFTW (GiB): " << bytes_to_GiB(needed_ram) << "\n"; + } + + return needed_ram; +} + +// Base gtest class for comparison with FFTW. +class accuracy_test : public ::testing::TestWithParam +{ +protected: + void SetUp() override {} + void TearDown() override {} + +public: + static std::string TestName(const testing::TestParamInfo& info) + { + return info.param.token(); + } +}; + +const static std::vector batch_range = {2, 1}; + +const static std::vector precision_range_full + = {fft_precision_double, fft_precision_single, fft_precision_half}; +const static std::vector precision_range_sp_dp + = {fft_precision_double, fft_precision_single}; + +const static std::vector place_range + = {fft_placement_inplace, fft_placement_notinplace}; +const static std::vector trans_type_range + = {fft_transform_type_complex_forward, fft_transform_type_real_forward}; +const static std::vector trans_type_range_complex + = {fft_transform_type_complex_forward}; +const static std::vector trans_type_range_real + = {fft_transform_type_real_forward}; + +// Given a vector of vector of lengths, generate all unique permutations. +// Add an optional vector of ad-hoc lengths to the result. +inline std::vector> + generate_lengths(const std::vector>& inlengths) +{ + std::vector> output; + if(inlengths.size() == 0) + { + return output; + } + const size_t dim = inlengths.size(); + std::vector looplength(dim); + for(unsigned int i = 0; i < dim; ++i) + { + looplength[i] = inlengths[i].size(); + } + for(unsigned int idx = 0; idx < inlengths.size(); ++idx) + { + std::vector index(dim); + do + { + std::vector length(dim); + for(unsigned int i = 0; i < dim; ++i) + { + length[i] = inlengths[i][index[i]]; + } + output.push_back(length); + } while(increment_rowmajor(index, looplength)); + } + // uniquify the result + std::sort(output.begin(), output.end()); + output.erase(std::unique(output.begin(), output.end()), output.end()); + return output; +} + +// Return the valid rocFFT input and output types for a given transform type. 
+inline std::vector> + iotypes(const fft_transform_type transformType, + const fft_result_placement place, + const bool planar = true) +{ + std::vector> iotypes; + switch(transformType) + { + case fft_transform_type_complex_forward: + case fft_transform_type_complex_inverse: + iotypes.push_back(std::make_pair( + fft_array_type_complex_interleaved, fft_array_type_complex_interleaved)); + if(planar) + { + iotypes.push_back(std::make_pair( + fft_array_type_complex_planar, fft_array_type_complex_planar)); + if(place == fft_placement_notinplace) + { + iotypes.push_back(std::make_pair( + fft_array_type_complex_planar, fft_array_type_complex_interleaved)); + iotypes.push_back(std::make_pair( + fft_array_type_complex_interleaved, fft_array_type_complex_planar)); + } + } + break; + case fft_transform_type_real_forward: + iotypes.push_back(std::make_pair( + fft_array_type_real, fft_array_type_hermitian_interleaved)); + if(planar && place == fft_placement_notinplace) + { + iotypes.push_back(std::make_pair( + fft_array_type_real, fft_array_type_hermitian_planar)); + } + break; + case fft_transform_type_real_inverse: + iotypes.push_back(std::make_pair( + fft_array_type_hermitian_interleaved, fft_array_type_real)); + if(planar && place == fft_placement_notinplace) + { + iotypes.push_back(std::make_pair( + fft_array_type_hermitian_planar, fft_array_type_real)); + } + break; + default: + throw std::runtime_error("Invalid transform type"); + } + return iotypes; +} + +// Generate all combinations of input/output types, from combinations of transform and placement +// types. +static std::vector + generate_types(fft_transform_type transform_type, + const std::vector& place_range, + const bool planar) +{ + std::vector ret; + for(auto place : place_range) + { + for(auto iotype : iotypes(transform_type, place, planar)) + { + ret.push_back(std::make_tuple(transform_type, place, iotype.first, iotype.second)); + } + } + return ret; +} + +struct stride_generator +{ + struct stride_dist + { + stride_dist(const std::vector& s, size_t d) + : stride(s) + , dist(d) + { + } + std::vector stride; + size_t dist; + }; + + // NOTE: allow for this ctor to be implicit, so it's less typing for a test writer + // + // cppcheck-suppress noExplicitConstructor + stride_generator(const std::vector>& stride_list_in) + : stride_list(stride_list_in) + { + } + virtual std::vector generate(const std::vector& lengths, + size_t batch) const + { + std::vector ret; + for(const auto& s : stride_list) + ret.emplace_back(s, 0); + return ret; + } + std::vector> stride_list; +}; + +// Generate strides such that batch is essentially the innermost dimension +// e.g. given a batch-2 4x3x2 transform which logically looks like: +// +// batch0: +// A B A B +// A B A B +// A B A B +// +// A B A B +// A B A B +// A B A B +// +// batch1: +// A B A B +// A B A B +// A B A B +// +// A B A B +// A B A B +// A B A B +// +// we instead do stride-2 4x3x2 transform where first batch is the +// A's and second batch is the B's. +struct stride_generator_3D_inner_batch : public stride_generator +{ + explicit stride_generator_3D_inner_batch(const std::vector>& stride_list_in) + : stride_generator(stride_list_in) + { + } + std::vector generate(const std::vector& lengths, + size_t batch) const override + { + std::vector ret = stride_generator::generate(lengths, batch); + std::vector strides{lengths[1] * lengths[2] * batch, lengths[2] * batch, batch}; + ret.emplace_back(strides, 1); + return ret; + } +}; + +// Create an array of parameters to pass to gtest. 
Base generator +// that allows choosing transform type. +inline auto param_generator_base(const std::vector& type_range, + const std::vector>& v_lengths, + const std::vector& precision_range, + const std::vector& batch_range, + decltype(generate_types) types_generator, + const stride_generator& istride, + const stride_generator& ostride, + const std::vector>& ioffset_range, + const std::vector>& ooffset_range, + const std::vector& place_range, + const bool planar = true, + const bool run_callbacks = false) +{ + + std::vector params; + + // For any length, we compute double-precision CPU reference + // for largest batch size first and reuse for smaller batch + // sizes, then convert to single-precision. + + for(auto& transform_type : type_range) + { + for(const auto& lengths : v_lengths) + { + // try to ensure that we are given literal lengths, not + // something to be passed to generate_lengths + if(lengths.empty() || lengths.size() > 3) + { + continue; + } + { + for(const auto precision : precision_range) + { + for(const auto batch : batch_range) + { + for(const auto& types : + types_generator(transform_type, place_range, planar)) + { + for(const auto& istride_dist : istride.generate(lengths, batch)) + { + for(const auto& ostride_dist : ostride.generate(lengths, batch)) + { + for(const auto& ioffset : ioffset_range) + { + for(const auto& ooffset : ooffset_range) + { + fft_params param; + + param.length = lengths; + param.istride = istride_dist.stride; + param.ostride = ostride_dist.stride; + param.nbatch = batch; + param.precision = precision; + param.transform_type = std::get<0>(types); + param.placement = std::get<1>(types); + param.idist = istride_dist.dist; + param.odist = ostride_dist.dist; + param.itype = std::get<2>(types); + param.otype = std::get<3>(types); + param.ioffset = ioffset; + param.ooffset = ooffset; + + if(run_callbacks) + { + // add a test if both input and output support callbacks + if(param.itype != fft_array_type_complex_planar + && param.itype != fft_array_type_hermitian_planar + && param.otype != fft_array_type_complex_planar + && param.otype + != fft_array_type_hermitian_planar) + { + param.run_callbacks = true; + } + else + { + continue; + } + } + param.validate(); + + // Keeping the random number generator here + // allows one to run the same tests for a given + // random seed; ie the test suite is repeatable. + std::hash hasher; + std::ranlux24_base gen(random_seed + + hasher(param.token())); + std::uniform_real_distribution<> dis(0.0, 1.0); + + if(param.is_planar()) + { + const double roll = dis(gen); + if(roll > planar_prob) + { + if(verbose > 4) + { + std::cout << "Planar transform skipped " + "(planar_prob: " + << planar_prob << " > " << roll + << ")\n"; + } + continue; + } + } + if(run_callbacks) + { + const double roll = dis(gen); + if(roll > callback_prob) + { + + if(verbose > 4) + { + std::cout << "Callback transform skipped " + "(planar_prob: " + << planar_prob << " > " << roll + << ")\n"; + } + continue; + } + } + + if(param.valid(0)) + { + params.push_back(param); + } + } + } + } + } + } + } + } + } + } + } + return params; +} + +// Create an array of parameters to pass to gtest. Default generator +// that picks all transform types. 
+inline auto param_generator(const std::vector>& v_lengths, + const std::vector& precision_range, + const std::vector& batch_range, + const stride_generator& istride, + const stride_generator& ostride, + const std::vector>& ioffset_range, + const std::vector>& ooffset_range, + const std::vector& place_range, + const bool planar, + const bool run_callbacks = false) +{ + return param_generator_base(trans_type_range, + v_lengths, + precision_range, + batch_range, + generate_types, + istride, + ostride, + ioffset_range, + ooffset_range, + place_range, + planar, + run_callbacks); +} + +// Create an array of parameters to pass to gtest. Only tests complex-type transforms +inline auto param_generator_complex(const std::vector>& v_lengths, + const std::vector& precision_range, + const std::vector& batch_range, + const stride_generator& istride, + const stride_generator& ostride, + const std::vector>& ioffset_range, + const std::vector>& ooffset_range, + const std::vector& place_range, + const bool planar, + const bool run_callbacks = false) +{ + return param_generator_base(trans_type_range_complex, + v_lengths, + precision_range, + batch_range, + generate_types, + istride, + ostride, + ioffset_range, + ooffset_range, + place_range, + planar, + run_callbacks); +} + +// Create an array of parameters to pass to gtest. +inline auto param_generator_real(const std::vector>& v_lengths, + const std::vector& precision_range, + const std::vector& batch_range, + const stride_generator& istride, + const stride_generator& ostride, + const std::vector>& ioffset_range, + const std::vector>& ooffset_range, + const std::vector& place_range, + const bool planar, + const bool run_callbacks = false) +{ + return param_generator_base(trans_type_range_real, + v_lengths, + precision_range, + batch_range, + generate_types, + istride, + ostride, + ioffset_range, + ooffset_range, + place_range, + planar, + run_callbacks); +} + +template +auto param_generator_token(const Tcontainer& tokens) +{ + std::vector params; + params.reserve(tokens.size()); + for(auto t : tokens) + { + params.push_back({}); + params.back().from_token(t); + } + return params; +} + +struct callback_test_data +{ + // scalar to modify the input/output with + double scalar; + // base address of input, to ensure that each callback gets an offset from that base + void* base; +}; + +void* get_load_callback_host(fft_array_type itype, + fft_precision precision, + bool round_trip_inverse); +void apply_load_callback(const fft_params& params, std::vector& input); +void apply_store_callback(const fft_params& params, std::vector& output); +void* get_store_callback_host(fft_array_type otype, + fft_precision precision, + bool round_trip_inverse); + +static auto allocate_cpu_fft_buffer(const fft_precision precision, + const fft_array_type type, + const std::vector& size) +{ + // FFTW does not support half-precision, so we do single instead. + // So if we need to do a half-precision FFTW transform, allocate + // enough buffer for single-precision instead. + return allocate_host_buffer( + precision == fft_precision_half ? 
fft_precision_single : precision, type, size); +} + +template +inline void execute_cpu_fft(fft_params& params, + fft_params& contiguous_params, + typename fftw_trait::fftw_plan_type& cpu_plan, + std::vector& cpu_input, + std::vector& cpu_output) +{ + // CPU output might not be allocated already for us, if FFTW never + // needed an output buffer during planning + if(cpu_output.empty()) + cpu_output = allocate_cpu_fft_buffer( + contiguous_params.precision, contiguous_params.otype, contiguous_params.osize); + + // If this is either C2R or callbacks are enabled, the + // input will be modified. So we need to modify the copy instead. + std::vector cpu_input_copy(cpu_input.size()); + std::vector* input_ptr = &cpu_input; + if(params.run_callbacks || contiguous_params.transform_type == fft_transform_type_real_inverse) + { + for(size_t i = 0; i < cpu_input.size(); ++i) + { + cpu_input_copy[i] = cpu_input[i].copy(); + } + + input_ptr = &cpu_input_copy; + } + + // run FFTW (which may destroy CPU input) + apply_load_callback(params, *input_ptr); + fftw_run(contiguous_params.transform_type, cpu_plan, *input_ptr, cpu_output); + // clean up + fftw_destroy_plan_type(cpu_plan); + // ask FFTW to fully clean up, since it tries to cache plan details + fftw_cleanup(); + cpu_plan = nullptr; + apply_store_callback(params, cpu_output); +} + +// execute the GPU transform +template +inline void execute_gpu_fft(Tparams& params, + std::vector& pibuffer, + std::vector& pobuffer, + std::vector& obuffer, + std::vector& gpu_output, + bool round_trip_inverse = false) +{ + gpubuf_t load_cb_data_dev; + gpubuf_t store_cb_data_dev; + if(params.run_callbacks) + { + void* load_cb_host + = get_load_callback_host(params.itype, params.precision, round_trip_inverse); + + callback_test_data load_cb_data_host; + + if(round_trip_inverse) + { + load_cb_data_host.scalar = params.store_cb_scalar; + } + else + { + load_cb_data_host.scalar = params.load_cb_scalar; + } + + load_cb_data_host.base = pibuffer.front(); + + auto hip_status = hipSuccess; + + hip_status = load_cb_data_dev.alloc(sizeof(callback_test_data)); + if(hip_status != hipSuccess) + { + ++n_hip_failures; + if(skip_runtime_fails) + { + GTEST_SKIP(); + } + else + { + GTEST_FAIL(); + } + } + hip_status = hipMemcpy(load_cb_data_dev.data(), + &load_cb_data_host, + sizeof(callback_test_data), + hipMemcpyHostToDevice); + if(hip_status != hipSuccess) + { + ++n_hip_failures; + if(skip_runtime_fails) + { + GTEST_SKIP(); + } + else + { + GTEST_FAIL(); + } + } + + void* store_cb_host + = get_store_callback_host(params.otype, params.precision, round_trip_inverse); + + callback_test_data store_cb_data_host; + + if(round_trip_inverse) + { + store_cb_data_host.scalar = params.load_cb_scalar; + } + else + { + store_cb_data_host.scalar = params.store_cb_scalar; + } + + store_cb_data_host.base = pobuffer.front(); + + hip_status = store_cb_data_dev.alloc(sizeof(callback_test_data)); + if(hip_status != hipSuccess) + { + ++n_hip_failures; + if(skip_runtime_fails) + { + GTEST_SKIP(); + } + else + { + GTEST_FAIL(); + } + } + + hip_status = hipMemcpy(store_cb_data_dev.data(), + &store_cb_data_host, + sizeof(callback_test_data), + hipMemcpyHostToDevice); + if(hip_status != hipSuccess) + { + ++n_hip_failures; + if(skip_runtime_fails) + { + GTEST_SKIP(); + } + else + { + GTEST_FAIL(); + } + } + + auto fft_status = params.set_callbacks( + load_cb_host, load_cb_data_dev.data(), store_cb_host, store_cb_data_dev.data()); + if(fft_status != fft_status_success) + throw std::runtime_error("set callback 
failure"); + } + + // Execute the transform: + auto fft_status = params.execute(pibuffer.data(), pobuffer.data()); + if(fft_status != fft_status_success) + throw std::runtime_error("rocFFT plan execution failure"); + + // if not comparing, then just executing the GPU FFT is all we + // need to do + if(!fftw_compare) + return; + + // finalize a multi-GPU transform + params.multi_gpu_finalize(obuffer, pobuffer); + + ASSERT_TRUE(!gpu_output.empty()) << "no output buffers"; + for(unsigned int idx = 0; idx < gpu_output.size(); ++idx) + { + ASSERT_TRUE(gpu_output[idx].data() != nullptr) + << "output buffer index " << idx << " is empty"; + auto hip_status = hipMemcpy(gpu_output[idx].data(), + pobuffer.at(idx), + gpu_output[idx].size(), + hipMemcpyDeviceToHost); + if(hip_status != hipSuccess) + { + ++n_hip_failures; + if(skip_runtime_fails) + { + GTEST_SKIP() << "hipMemcpy failure"; + } + else + { + GTEST_FAIL() << "hipMemcpy failure"; + } + } + } + if(verbose > 2) + { + std::cout << "GPU output:\n"; + params.print_obuffer(gpu_output); + } + if(verbose > 5) + { + std::cout << "flat GPU output:\n"; + params.print_obuffer_flat(gpu_output); + } +} + +template +static void assert_init_value(const std::vector& output, + const size_t idx, + const Tfloat orig_value); + +template <> +void assert_init_value(const std::vector& output, const size_t idx, const float orig_value) +{ + float actual_value = reinterpret_cast(output.front().data())[idx]; + ASSERT_EQ(actual_value, orig_value) << "index " << idx; +} + +template <> +void assert_init_value(const std::vector& output, + const size_t idx, + const double orig_value) +{ + double actual_value = reinterpret_cast(output.front().data())[idx]; + ASSERT_EQ(actual_value, orig_value) << "index " << idx; +} + +template <> +void assert_init_value(const std::vector& output, + const size_t idx, + const rocfft_complex orig_value) +{ + // if this is interleaved, check directly + if(output.size() == 1) + { + rocfft_complex actual_value + = reinterpret_cast*>(output.front().data())[idx]; + ASSERT_EQ(actual_value.x, orig_value.x) << "x index " << idx; + ASSERT_EQ(actual_value.y, orig_value.y) << "y index " << idx; + } + else + { + // planar + rocfft_complex actual_value{ + reinterpret_cast(output.front().data())[idx], + reinterpret_cast(output.back().data())[idx]}; + ASSERT_EQ(actual_value.x, orig_value.x) << "x index " << idx; + ASSERT_EQ(actual_value.y, orig_value.y) << "y index " << idx; + } +} + +template <> +void assert_init_value(const std::vector& output, + const size_t idx, + const rocfft_complex orig_value) +{ + // if this is interleaved, check directly + if(output.size() == 1) + { + rocfft_complex actual_value + = reinterpret_cast*>(output.front().data())[idx]; + ASSERT_EQ(actual_value.x, orig_value.x) << "x index " << idx; + ASSERT_EQ(actual_value.y, orig_value.y) << "y index " << idx; + } + else + { + // planar + rocfft_complex actual_value{ + reinterpret_cast(output.front().data())[idx], + reinterpret_cast(output.back().data())[idx]}; + ASSERT_EQ(actual_value.x, orig_value.x) << "x index " << idx; + ASSERT_EQ(actual_value.y, orig_value.y) << "y index " << idx; + } +} + +static const int OUTPUT_INIT_PATTERN = 0xcd; +template +void check_single_output_stride(const std::vector& output, + const size_t offset, + const std::vector& length, + const std::vector& stride, + const size_t i) +{ + Tfloat orig; + memset(static_cast(&orig), OUTPUT_INIT_PATTERN, sizeof(Tfloat)); + + size_t curLength = length[i]; + size_t curStride = stride[i]; + size_t nextSmallerLength = i == 
length.size() - 1 ? 0 : length[i + 1]; + size_t nextSmallerStride = i == stride.size() - 1 ? 0 : stride[i + 1]; + + if(nextSmallerLength == 0) + { + // this is the fastest dim, indexes that are not multiples of + // the stride should be the initial value + for(size_t idx = 0; idx < (curLength - 1) * curStride; ++idx) + { + if(idx % curStride != 0) + assert_init_value(output, idx, orig); + } + } + else + { + for(size_t lengthIdx = 0; lengthIdx < curLength; ++lengthIdx) + { + // check that the space after the next smaller dim and the + // end of this dim is initial value + for(size_t idx = nextSmallerLength * nextSmallerStride; idx < curStride; ++idx) + assert_init_value(output, idx, orig); + + check_single_output_stride( + output, offset + lengthIdx * curStride, length, stride, i + 1); + } + } +} + +template +void check_output_strides(const std::vector& output, Tparams& params) +{ + // treat batch+dist like highest length+stride, if batch > 1 + std::vector length; + std::vector stride; + if(params.nbatch > 1) + { + length.push_back(params.nbatch); + stride.push_back(params.odist); + } + + auto olength = params.olength(); + std::copy(olength.begin(), olength.end(), std::back_inserter(length)); + std::copy(params.ostride.begin(), params.ostride.end(), std::back_inserter(stride)); + + if(params.precision == fft_precision_single) + { + if(params.otype == fft_array_type_real) + check_single_output_stride(output, 0, length, stride, 0); + else + check_single_output_stride>(output, 0, length, stride, 0); + } + else + { + if(params.otype == fft_array_type_real) + check_single_output_stride(output, 0, length, stride, 0); + else + check_single_output_stride>(output, 0, length, stride, 0); + } +} + +// run rocFFT inverse transform +template +inline void run_round_trip_inverse(Tparams& params, + std::vector& obuffer, + std::vector& pibuffer, + std::vector& pobuffer, + std::vector& gpu_output) +{ + params.validate(); + + // Make sure that the parameters make sense: + ASSERT_TRUE(params.valid(verbose)); + + // Create FFT plan - this will also allocate work buffer, but will throw a + // specific exception if that step fails + auto plan_status = fft_status_success; + try + { + plan_status = params.create_plan(); + } + catch(fft_params::work_buffer_alloc_failure& e) + { + std::stringstream ss; + ss << "Failed to allocate work buffer (size: " << params.workbuffersize << ")"; + ++n_hip_failures; + if(skip_runtime_fails) + { + GTEST_SKIP() << ss.str(); + } + else + { + GTEST_FAIL() << ss.str(); + } + } + ASSERT_EQ(plan_status, fft_status_success) << "round trip inverse plan creation failed"; + + auto obuffer_sizes = params.obuffer_sizes(); + + if(params.placement != fft_placement_inplace) + { + for(unsigned int i = 0; i < obuffer_sizes.size(); ++i) + { + // If we're validating output strides, init the + // output buffer to a known pattern and we can check + // that the pattern is untouched in places that + // shouldn't have been touched. 
+ if(params.check_output_strides) + { + auto hip_status + = hipMemset(obuffer[i].data(), OUTPUT_INIT_PATTERN, obuffer_sizes[i]); + if(hip_status != hipSuccess) + { + ++n_hip_failures; + if(skip_runtime_fails) + { + GTEST_SKIP() << "hipMemset failure"; + } + else + { + GTEST_FAIL() << "hipMemset failure"; + } + } + } + } + } + + // execute GPU transform + execute_gpu_fft(params, pibuffer, pobuffer, obuffer, gpu_output, true); +} + +// compare rocFFT inverse transform with forward transform input +template +inline void compare_round_trip_inverse(Tparams& params, + fft_params& contiguous_params, + std::vector& gpu_output, + std::vector& cpu_input, + const VectorNorms& cpu_input_norm, + size_t total_length) +{ + if(params.check_output_strides) + { + check_output_strides(gpu_output, params); + } + + // compute GPU output norm + std::shared_future gpu_norm = std::async(std::launch::async, [&]() { + return norm(gpu_output, + params.olength(), + params.nbatch, + params.precision, + params.otype, + params.ostride, + params.odist, + params.ooffset); + }); + + // compare GPU inverse output to CPU forward input + std::unique_ptr>> linf_failures; + if(verbose > 1) + linf_failures = std::make_unique>>(); + const double linf_cutoff + = type_epsilon(params.precision) * cpu_input_norm.l_inf * log(total_length); + + VectorNorms diff = distance(cpu_input, + gpu_output, + params.olength(), + params.nbatch, + params.precision, + contiguous_params.itype, + contiguous_params.istride, + contiguous_params.idist, + params.otype, + params.ostride, + params.odist, + linf_failures.get(), + linf_cutoff, + {0}, + params.ooffset, + 1.0 / total_length); + + if(verbose > 1) + { + std::cout << "GPU output Linf norm: " << gpu_norm.get().l_inf << "\n"; + std::cout << "GPU output L2 norm: " << gpu_norm.get().l_2 << "\n"; + std::cout << "GPU linf norm failures:"; + std::sort(linf_failures->begin(), linf_failures->end()); + for(const auto& i : *linf_failures) + { + std::cout << " (" << i.first << "," << i.second << ")"; + } + std::cout << std::endl; + } + + EXPECT_TRUE(std::isfinite(gpu_norm.get().l_inf)) << params.str(); + EXPECT_TRUE(std::isfinite(gpu_norm.get().l_2)) << params.str(); + + switch(params.precision) + { + case fft_precision_half: + max_linf_eps_half + = std::max(max_linf_eps_half, diff.l_inf / cpu_input_norm.l_inf / log(total_length)); + max_l2_eps_half + = std::max(max_l2_eps_half, diff.l_2 / cpu_input_norm.l_2 * sqrt(log2(total_length))); + break; + case fft_precision_single: + max_linf_eps_single + = std::max(max_linf_eps_single, diff.l_inf / cpu_input_norm.l_inf / log(total_length)); + max_l2_eps_single + = std::max(max_l2_eps_single, diff.l_2 / cpu_input_norm.l_2 * sqrt(log2(total_length))); + break; + case fft_precision_double: + max_linf_eps_double + = std::max(max_linf_eps_double, diff.l_inf / cpu_input_norm.l_inf / log(total_length)); + max_l2_eps_double + = std::max(max_l2_eps_double, diff.l_2 / cpu_input_norm.l_2 * sqrt(log2(total_length))); + break; + } + + if(verbose > 1) + { + std::cout << "L2 diff: " << diff.l_2 << "\n"; + std::cout << "Linf diff: " << diff.l_inf << "\n"; + } + + EXPECT_TRUE(diff.l_inf <= linf_cutoff) + << "Linf test failed. Linf:" << diff.l_inf + << "\tnormalized Linf: " << diff.l_inf / cpu_input_norm.l_inf << "\tcutoff: " << linf_cutoff + << params.str(); + + EXPECT_TRUE(diff.l_2 / cpu_input_norm.l_2 + < sqrt(log2(total_length)) * type_epsilon(params.precision)) + << "L2 test failed. 
L2: " << diff.l_2 + << "\tnormalized L2: " << diff.l_2 / cpu_input_norm.l_2 + << "\tepsilon: " << sqrt(log2(total_length)) * type_epsilon(params.precision) + << params.str(); +} + +// RAII type to put data into the cache when this object leaves scope +struct StoreCPUDataToCache +{ + StoreCPUDataToCache(std::vector& cpu_input, std::vector& cpu_output) + : cpu_input(cpu_input) + , cpu_output(cpu_output) + { + } + ~StoreCPUDataToCache() + { + last_cpu_fft_data.cpu_output.swap(cpu_output); + last_cpu_fft_data.cpu_input.swap(cpu_input); + } + std::vector& cpu_input; + std::vector& cpu_output; +}; + +// run CPU + rocFFT transform with the given params and compare +template +inline void fft_vs_reference_impl(Tparams& params, bool round_trip) +{ + // Call hipGetLastError to reset any errors + // returned by previous HIP runtime API calls. + hipError_t hip_status = hipGetLastError(); + + // Make sure that the parameters make sense: + ASSERT_TRUE(params.valid(verbose)); + + size_t needed_ram = needed_ram_buffers(params, verbose); + + if(ramgb > 0 && needed_ram > ramgb * ONE_GiB) + { + GTEST_SKIP() << "needed_ramgb: " << bytes_to_GiB(needed_ram) << ", ramgb limit: " << ramgb + << ".\n"; + } + + auto ibuffer_sizes = params.ibuffer_sizes(); + auto obuffer_sizes = params.obuffer_sizes(); + + size_t vram_avail = 0; + + if(vramgb == 0) + { + // Check free and total available memory: + size_t free = 0; + size_t total = 0; + auto hip_status = hipMemGetInfo(&free, &total); + if(hip_status != hipSuccess || total == 0) + { + ++n_hip_failures; + std::stringstream ss; + if(total == 0) + ss << "hipMemGetInfo claims there there isn't any vram"; + else + ss << "hipMemGetInfo failure with error " << hip_status; + if(skip_runtime_fails) + { + GTEST_SKIP() << ss.str(); + } + else + { + GTEST_FAIL() << ss.str(); + } + } + vram_avail = total; + } + else + { + vram_avail = vramgb * ONE_GiB; + } + + // First try a quick estimation of vram footprint, to speed up skipping tests + // that are too large to fit in the gpu (no plan created with the rocFFT backend) + const auto raw_vram_footprint + = params.fft_params_vram_footprint() + twiddle_table_vram_footprint(params); + + if(!vram_fits_problem(raw_vram_footprint, vram_avail)) + { + GTEST_SKIP() << "Raw problem size (" << bytes_to_GiB(raw_vram_footprint) + << " GiB) raw data too large for device"; + } + + if(verbose > 2) + { + std::cout << "Raw problem size: " << raw_vram_footprint << std::endl; + } + + // If it passed the quick estimation test, go for the more + // accurate calculation that actually creates the plan and + // take into account the work buffer size + const auto vram_footprint = params.vram_footprint(); + if(!vram_fits_problem(vram_footprint, vram_avail)) + { + if(verbose) + { + std::cout << "Problem raw data won't fit on device; skipped." 
<< std::endl; + } + GTEST_SKIP() << "Problem size (" << bytes_to_GiB(vram_footprint) + << " GiB) raw data too large for device"; + } + + // Create FFT plan - this will also allocate work buffer, but + // will throw a specific exception if that step fails + auto plan_status = fft_status_success; + try + { + plan_status = params.create_plan(); + } + catch(fft_params::work_buffer_alloc_failure& e) + { + ++n_hip_failures; + std::stringstream ss; + ss << "Work buffer allocation failed with size: " << params.workbuffersize; + if(skip_runtime_fails) + { + GTEST_SKIP() << ss.str(); + } + else + { + GTEST_FAIL() << ss.str(); + } + } + ASSERT_EQ(plan_status, fft_status_success) << "plan creation failed"; + + if(!vram_fits_problem(vram_footprint, vram_avail)) + { + if(verbose) + { + std::cout << "Problem won't fit on device; skipped." << std::endl; + } + GTEST_SKIP() << "Problem size (" << vram_footprint << ") too large for device"; + return; + } + + fft_params contiguous_params; + contiguous_params.length = params.length; + contiguous_params.precision = params.precision; + contiguous_params.placement = fft_placement_notinplace; + contiguous_params.transform_type = params.transform_type; + contiguous_params.nbatch = params.nbatch; + contiguous_params.itype = contiguous_itype(params.transform_type); + contiguous_params.otype = contiguous_otype(contiguous_params.transform_type); + + contiguous_params.validate(); + + if(!contiguous_params.valid(verbose)) + { + throw std::runtime_error("Invalid contiguous params"); + } + + if(verbose > 3) + { + std::cout << "CPU params:\n"; + std::cout << contiguous_params.str("\n\t") << std::endl; + } + + std::vector ibuffer(ibuffer_sizes.size()); + std::vector pibuffer(ibuffer_sizes.size()); + for(unsigned int i = 0; i < ibuffer.size(); ++i) + { + hip_status = ibuffer[i].alloc(ibuffer_sizes[i]); + if(hip_status != hipSuccess) + { + std::stringstream ss; + ss << "hipMalloc failure for input buffer " << i << " size " << ibuffer_sizes[i] << "(" + << bytes_to_GiB(ibuffer_sizes[i]) << " GiB)" + << " with code " << hipError_to_string(hip_status); + ++n_hip_failures; + if(skip_runtime_fails) + { + GTEST_SKIP() << ss.str(); + } + else + { + GTEST_FAIL() << ss.str(); + } + } + pibuffer[i] = ibuffer[i].data(); + } + + // allocation counts in elements, ibuffer_sizes is in bytes + auto ibuffer_sizes_elems = ibuffer_sizes; + for(auto& buf : ibuffer_sizes_elems) + buf /= var_size(params.precision, params.itype); + + // Check cache first - nbatch is a >= comparison because we compute + // the largest batch size and cache it. Smaller batch runs can + // compare against the larger data. + std::vector cpu_input; + std::vector cpu_output; + std::shared_future convert_cpu_output_precision; + std::shared_future convert_cpu_input_precision; + bool run_fftw = true; + std::unique_ptr store_to_cache; + if(fftw_compare && last_cpu_fft_data.length == params.length + && last_cpu_fft_data.transform_type == params.transform_type + && last_cpu_fft_data.run_callbacks == params.run_callbacks) + { + if(last_cpu_fft_data.nbatch >= params.nbatch) + { + // use the cached input/output + cpu_input.swap(last_cpu_fft_data.cpu_input); + cpu_output.swap(last_cpu_fft_data.cpu_output); + run_fftw = false; + + store_to_cache = std::make_unique(cpu_input, cpu_output); + + if(params.precision != last_cpu_fft_data.precision) + { + // Tests should be ordered so we do wider first, then narrower. 
+ switch(params.precision) + { + case fft_precision_double: + std::cerr + << "test ordering is incorrect: double precision follows a narrower one" + << std::endl; + abort(); + break; + case fft_precision_single: + if(last_cpu_fft_data.precision != fft_precision_double) + { + std::cerr + << "test ordering is incorrect: float precision follows a narrower one" + << std::endl; + abort(); + } + // convert the input/output to single-precision + convert_cpu_output_precision = std::async(std::launch::async, [&]() { + narrow_precision_inplace(cpu_output.front()); + }); + convert_cpu_input_precision = std::async(std::launch::async, [&]() { + narrow_precision_inplace(cpu_input.front()); + }); + break; + case fft_precision_half: + // convert to half precision + if(last_cpu_fft_data.precision == fft_precision_double) + { + convert_cpu_output_precision = std::async(std::launch::async, [&]() { + narrow_precision_inplace(cpu_output.front()); + }); + convert_cpu_input_precision = std::async(std::launch::async, [&]() { + narrow_precision_inplace(cpu_input.front()); + }); + } + else if(last_cpu_fft_data.precision == fft_precision_single) + { + convert_cpu_output_precision = std::async(std::launch::async, [&]() { + narrow_precision_inplace(cpu_output.front()); + }); + convert_cpu_input_precision = std::async(std::launch::async, [&]() { + narrow_precision_inplace(cpu_input.front()); + }); + } + else + { + std::cerr << "unhandled previous precision, cannot convert to half" + << std::endl; + abort(); + } + break; + } + last_cpu_fft_data.precision = params.precision; + } + } + // If the last result has a smaller batch than the new + // params, that might be a developer error - tests should be + // ordered to generate the bigger batch first. But if tests + // got filtered or skipped due to insufficient memory, we + // might never have tried to generate the bigger batch first. + // So just fall through and redo the CPU FFT. + } + else + { + // Clear cache explicitly so that even if we didn't get a hit, + // we're not uselessly holding on to cached cpu input/output + last_cpu_fft_data = last_cpu_fft_cache(); + } + + // Allocate CPU input + if(run_fftw) + { + cpu_input = allocate_cpu_fft_buffer( + contiguous_params.precision, contiguous_params.itype, contiguous_params.isize); + } + + // Create FFTW plan - this may write to input, but that's fine + // since there's nothing in there right now + typename fftw_trait::fftw_plan_type cpu_plan = nullptr; + if(run_fftw) + { + // Normally, we would want to defer allocation of CPU output + // buffer until when we actually do the CPU FFT. But if we're + // using FFTW wisdom, FFTW needs an output buffer at plan + // creation time. + if(use_fftw_wisdom) + { + cpu_output = allocate_cpu_fft_buffer( + contiguous_params.precision, contiguous_params.otype, contiguous_params.osize); + } + cpu_plan = fftw_plan_via_rocfft(contiguous_params.length, + contiguous_params.istride, + contiguous_params.ostride, + contiguous_params.nbatch, + contiguous_params.idist, + contiguous_params.odist, + contiguous_params.transform_type, + cpu_input, + cpu_output); + + needed_ram += needed_ram_fftw(contiguous_params, cpu_plan, verbose); + + if(ramgb > 0 && needed_ram > ramgb * ONE_GiB) + { + if(verbose) + { + std::cout << "Problem exceeds memory limit; skipped [rocfft_transform]." 
+ << std::endl; + } + GTEST_SKIP(); + return; + } + } + + std::vector gpu_input_data; + + // allocate and populate the input buffer (cpu/gpu) + if(run_fftw) + { + gpu_input_data = allocate_host_buffer(params.precision, params.itype, ibuffer_sizes_elems); + + //generate the input directly on the gpu + params.compute_input(ibuffer); + + // Copy the input to CPU + if(params.itype != contiguous_params.itype || params.istride != contiguous_params.istride + || params.idist != contiguous_params.idist || params.isize != contiguous_params.isize) + { + // Copy input to CPU + for(unsigned int idx = 0; idx < ibuffer.size(); ++idx) + { + hip_status = hipMemcpy(gpu_input_data.at(idx).data(), + ibuffer[idx].data(), + ibuffer_sizes[idx], + hipMemcpyDeviceToHost); + if(hip_status != hipSuccess) + { + ++n_hip_failures; + if(skip_runtime_fails) + { + GTEST_SKIP() << "hipMemcpy failure with error " << hip_status; + } + else + { + GTEST_FAIL() << "hipMemcpy failure with error " << hip_status; + } + } + } + + copy_buffers(gpu_input_data, + cpu_input, + params.ilength(), + params.nbatch, + params.precision, + params.itype, + params.istride, + params.idist, + contiguous_params.itype, + contiguous_params.istride, + contiguous_params.idist, + params.ioffset, + contiguous_params.ioffset); + } + else + { + // Copy input to CPU + for(unsigned int idx = 0; idx < ibuffer.size(); ++idx) + { + hip_status = hipMemcpy(cpu_input.at(idx).data(), + ibuffer[idx].data(), + ibuffer_sizes[idx], + hipMemcpyDeviceToHost); + if(hip_status != hipSuccess) + { + ++n_hip_failures; + if(skip_runtime_fails) + { + GTEST_SKIP() << "hipMemcpy failure with error " << hip_status; + } + else + { + GTEST_FAIL() << "hipMemcpy failure with error " << hip_status; + } + } + } + } + } + else if(fftw_compare) + { + gpu_input_data = allocate_host_buffer(params.precision, params.itype, ibuffer_sizes_elems); + + // In case the cached cpu input needed conversion, wait for it + if(convert_cpu_input_precision.valid()) + convert_cpu_input_precision.get(); + + // gets a pre-computed gpu input buffer from the cpu cache + std::vector* gpu_input = &cpu_input; + + if(params.itype != contiguous_params.itype || params.istride != contiguous_params.istride + || params.idist != contiguous_params.idist || params.isize != contiguous_params.isize) + { + copy_buffers(cpu_input, + gpu_input_data, + params.ilength(), + params.nbatch, + params.precision, + contiguous_params.itype, + contiguous_params.istride, + contiguous_params.idist, + params.itype, + params.istride, + params.idist, + {0}, + params.ioffset); + gpu_input = &gpu_input_data; + } + + // Copy input to GPU + for(unsigned int idx = 0; idx < gpu_input->size(); ++idx) + { + hip_status = hipMemcpy(ibuffer[idx].data(), + gpu_input->at(idx).data(), + ibuffer_sizes[idx], + hipMemcpyHostToDevice); + + if(hip_status != hipSuccess) + { + ++n_hip_failures; + if(skip_runtime_fails) + { + GTEST_SKIP() << "hipMemcpy failure with error " << hip_status; + } + else + { + GTEST_FAIL() << "hipMemcpy failure with error " << hip_status; + } + } + } + } + + if(verbose > 3) + { + std::cout << "CPU input:\n"; + contiguous_params.print_ibuffer(cpu_input); + } + + // compute input norm + std::shared_future cpu_input_norm; + if(fftw_compare) + cpu_input_norm = std::async(std::launch::async, [&]() { + // in case the cached cpu input needed conversion, wait for it + if(convert_cpu_input_precision.valid()) + convert_cpu_input_precision.get(); + + auto input_norm = norm(cpu_input, + contiguous_params.ilength(), + contiguous_params.nbatch, + 
contiguous_params.precision, + contiguous_params.itype, + contiguous_params.istride, + contiguous_params.idist, + contiguous_params.ioffset); + if(verbose > 2) + { + std::cout << "CPU Input Linf norm: " << input_norm.l_inf << "\n"; + std::cout << "CPU Input L2 norm: " << input_norm.l_2 << "\n"; + } + return input_norm; + }); + + std::vector obuffer_data; + std::vector* obuffer = &obuffer_data; + std::vector pobuffer; + + // allocate the output buffer + + if(params.placement == fft_placement_inplace) + { + obuffer = &ibuffer; + } + else + { + auto obuffer_sizes = params.obuffer_sizes(); + obuffer_data.resize(obuffer_sizes.size()); + for(unsigned int i = 0; i < obuffer_data.size(); ++i) + { + hip_status = obuffer_data[i].alloc(obuffer_sizes[i]); + if(hip_status != hipSuccess) + { + ++n_hip_failures; + std::stringstream ss; + ss << "hipMalloc failure for output buffer " << i << " size " << obuffer_sizes[i] + << "(" << bytes_to_GiB(obuffer_sizes[i]) << " GiB)" + << " with code " << hipError_to_string(hip_status); + if(skip_runtime_fails) + { + GTEST_SKIP() << ss.str(); + } + else + { + GTEST_FAIL() << ss.str(); + } + } + + // If we're validating output strides, init the + // output buffer to a known pattern and we can check + // that the pattern is untouched in places that + // shouldn't have been touched. + if(params.check_output_strides) + { + hip_status + = hipMemset(obuffer_data[i].data(), OUTPUT_INIT_PATTERN, obuffer_sizes[i]); + if(hip_status != hipSuccess) + { + ++n_hip_failures; + if(skip_runtime_fails) + { + GTEST_SKIP() << "hipMemset failure with error " << hip_status; + } + else + { + GTEST_FAIL() << "hipMemset failure with error " << hip_status; + } + } + } + } + } + pobuffer.resize(obuffer->size()); + for(unsigned int i = 0; i < obuffer->size(); ++i) + { + pobuffer[i] = obuffer->at(i).data(); + } + + // Run CPU transform + // + // NOTE: This must happen after input is copied to GPU and input + // norm is computed, since the CPU FFT may overwrite the input. 
+ VectorNorms cpu_output_norm; + std::shared_future cpu_fft; + if(fftw_compare) + cpu_fft = std::async(std::launch::async, [&]() { + // wait for input norm to finish, since we might overwrite input + cpu_input_norm.get(); + + if(run_fftw) + execute_cpu_fft(params, contiguous_params, cpu_plan, cpu_input, cpu_output); + // in case the cached cpu output needed conversion, wait for it + else if(convert_cpu_output_precision.valid()) + convert_cpu_output_precision.get(); + + if(verbose > 3) + { + std::cout << "CPU output:\n"; + contiguous_params.print_obuffer(cpu_output); + } + + cpu_output_norm = norm(cpu_output, + params.olength(), + params.nbatch, + params.precision, + contiguous_params.otype, + contiguous_params.ostride, + contiguous_params.odist, + contiguous_params.ooffset); + if(verbose > 2) + { + std::cout << "CPU Output Linf norm: " << cpu_output_norm.l_inf << "\n"; + std::cout << "CPU Output L2 norm: " << cpu_output_norm.l_2 << "\n"; + } + }); + + // scatter data out to multi-GPUs if this is a multi-GPU test + params.multi_gpu_prepare(ibuffer, pibuffer, pobuffer); + + // execute GPU transform + std::vector gpu_output + = allocate_host_buffer(params.precision, params.otype, params.osize); + + execute_gpu_fft(params, pibuffer, pobuffer, *obuffer, gpu_output); + + params.free(); + + if(params.check_output_strides) + { + check_output_strides(gpu_output, params); + } + + // compute GPU output norm + std::shared_future gpu_norm; + if(fftw_compare) + gpu_norm = std::async(std::launch::async, [&]() { + return norm(gpu_output, + params.olength(), + params.nbatch, + params.precision, + params.otype, + params.ostride, + params.odist, + params.ooffset); + }); + + // compare output + // + // Compute the l-infinity and l-2 distance between the CPU and GPU output: + // wait for cpu FFT so we can compute cutoff + + const auto total_length = std::accumulate(params.length.begin(), + params.length.end(), + static_cast(1), + std::multiplies()); + + std::unique_ptr>> linf_failures; + if(verbose > 1) + linf_failures = std::make_unique>>(); + double linf_cutoff; + VectorNorms diff; + + std::shared_future compare_output; + if(fftw_compare) + compare_output = std::async(std::launch::async, [&]() { + cpu_fft.get(); + linf_cutoff + = type_epsilon(params.precision) * cpu_output_norm.l_inf * log(total_length); + + diff = distance(cpu_output, + gpu_output, + params.olength(), + params.nbatch, + params.precision, + contiguous_params.otype, + contiguous_params.ostride, + contiguous_params.odist, + params.otype, + params.ostride, + params.odist, + linf_failures.get(), + linf_cutoff, + {0}, + params.ooffset); + }); + + // Update the cache if this current transform is different from + // what's stored. But if this transform only has a smaller batch + // than what's cached, we can still keep the cache around since + // the input/output we already have is still valid. 
+ const bool update_last_cpu_fft_data + = last_cpu_fft_data.length != params.length + || last_cpu_fft_data.transform_type != params.transform_type + || last_cpu_fft_data.run_callbacks != params.run_callbacks + || last_cpu_fft_data.precision != params.precision + || params.nbatch > last_cpu_fft_data.nbatch; + + // store cpu output in cache + if(update_last_cpu_fft_data) + { + last_cpu_fft_data.length = params.length; + last_cpu_fft_data.nbatch = params.nbatch; + last_cpu_fft_data.transform_type = params.transform_type; + last_cpu_fft_data.run_callbacks = params.run_callbacks; + last_cpu_fft_data.precision = params.precision; + } + + if(compare_output.valid()) + compare_output.get(); + + if(!store_to_cache) + store_to_cache = std::make_unique(cpu_input, cpu_output); + + Tparams params_inverse; + + if(round_trip) + { + params_inverse.inverse_from_forward(params); + + run_round_trip_inverse( + params_inverse, ibuffer, pobuffer, pibuffer, gpu_input_data); + } + + if(fftw_compare) + { + ASSERT_TRUE(std::isfinite(cpu_input_norm.get().l_2)); + ASSERT_TRUE(std::isfinite(cpu_input_norm.get().l_inf)); + + ASSERT_TRUE(std::isfinite(cpu_output_norm.l_2)); + ASSERT_TRUE(std::isfinite(cpu_output_norm.l_inf)); + + if(verbose > 1) + { + std::cout << "GPU output Linf norm: " << gpu_norm.get().l_inf << "\n"; + std::cout << "GPU output L2 norm: " << gpu_norm.get().l_2 << "\n"; + std::cout << "GPU linf norm failures:"; + std::sort(linf_failures->begin(), linf_failures->end()); + for(const auto& i : *linf_failures) + { + std::cout << " (" << i.first << "," << i.second << ")"; + } + std::cout << std::endl; + } + + EXPECT_TRUE(std::isfinite(gpu_norm.get().l_inf)) << params.str(); + EXPECT_TRUE(std::isfinite(gpu_norm.get().l_2)) << params.str(); + } + + switch(params.precision) + { + case fft_precision_half: + max_linf_eps_half + = std::max(max_linf_eps_half, diff.l_inf / cpu_output_norm.l_inf / log(total_length)); + max_l2_eps_half + = std::max(max_l2_eps_half, diff.l_2 / cpu_output_norm.l_2 * sqrt(log2(total_length))); + break; + case fft_precision_single: + max_linf_eps_single + = std::max(max_linf_eps_single, diff.l_inf / cpu_output_norm.l_inf / log(total_length)); + max_l2_eps_single = std::max(max_l2_eps_single, + diff.l_2 / cpu_output_norm.l_2 * sqrt(log2(total_length))); + break; + case fft_precision_double: + max_linf_eps_double + = std::max(max_linf_eps_double, diff.l_inf / cpu_output_norm.l_inf / log(total_length)); + max_l2_eps_double = std::max(max_l2_eps_double, + diff.l_2 / cpu_output_norm.l_2 * sqrt(log2(total_length))); + break; + } + + if(verbose > 1) + { + std::cout << "L2 diff: " << diff.l_2 << "\n"; + std::cout << "Linf diff: " << diff.l_inf << "\n"; + } + + if(fftw_compare) + { + EXPECT_TRUE(diff.l_inf <= linf_cutoff) + << "Linf test failed. Linf:" << diff.l_inf + << "\tnormalized Linf: " << diff.l_inf / cpu_output_norm.l_inf + << "\tcutoff: " << linf_cutoff << params.str(); + + EXPECT_TRUE(diff.l_2 / cpu_output_norm.l_2 + < sqrt(log2(total_length)) * type_epsilon(params.precision)) + << "L2 test failed. 
L2: " << diff.l_2 + << "\tnormalized L2: " << diff.l_2 / cpu_output_norm.l_2 + << "\tepsilon: " << sqrt(log2(total_length)) * type_epsilon(params.precision) + << params.str(); + } + + if(round_trip && fftw_compare) + { + compare_round_trip_inverse(params_inverse, + contiguous_params, + gpu_input_data, + cpu_input, + cpu_input_norm.get(), + total_length); + } +} + +#endif diff --git a/shared/arithmetic.h b/shared/arithmetic.h new file mode 100644 index 0000000..774d342 --- /dev/null +++ b/shared/arithmetic.h @@ -0,0 +1,61 @@ +/****************************************************************************** +* Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +* THE SOFTWARE. +*******************************************************************************/ + +#pragma once + +#include +#include + +// arithmetic helper functions + +static inline bool IsPo2(size_t u) +{ + return (u != 0) && (0 == (u & (u - 1))); +} + +// help function: Find the smallest power of 2 that is >= n; return its +// power of 2 factor +// e.g., CeilPo2 (7) returns 3 : (2^3 >= 7) +static inline size_t CeilPo2(size_t n) +{ + size_t v = 1, t = 0; + while(v < n) + { + v <<= 1; + t++; + } + + return t; +} + +template +static inline T DivRoundingUp(T a, T b) +{ + return (a + (b - 1)) / b; +} + +template +typename Titer::value_type product(Titer begin, Titer end) +{ + return std::accumulate( + begin, end, typename Titer::value_type(1), std::multiplies()); +} diff --git a/shared/array_predicate.h b/shared/array_predicate.h new file mode 100644 index 0000000..92e45b4 --- /dev/null +++ b/shared/array_predicate.h @@ -0,0 +1,47 @@ +// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#ifndef ROCFFT_ARRAY_PREDICATE_H +#define ROCFFT_ARRAY_PREDICATE_H + +#include "rocfft/rocfft.h" + +namespace +{ + bool array_type_is_complex(rocfft_array_type type) + { + return type == rocfft_array_type_complex_interleaved + || type == rocfft_array_type_complex_planar + || type == rocfft_array_type_hermitian_interleaved + || type == rocfft_array_type_hermitian_planar; + } + bool array_type_is_interleaved(rocfft_array_type type) + { + return type == rocfft_array_type_complex_interleaved + || type == rocfft_array_type_hermitian_interleaved; + } + bool array_type_is_planar(rocfft_array_type type) + { + return type == rocfft_array_type_complex_planar + || type == rocfft_array_type_hermitian_planar; + } +} + +#endif diff --git a/shared/array_validator.cpp b/shared/array_validator.cpp new file mode 100644 index 0000000..70abb08 --- /dev/null +++ b/shared/array_validator.cpp @@ -0,0 +1,549 @@ +// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#include +#include +#include + +#include "array_validator.h" +#include "increment.h" + +// Check a 2D array for collisions. +// The 2D case can be determined via a number-theoretic argument. +bool valid_length_stride_2d(const size_t l0, const size_t l1, const size_t s0, const size_t s1) +{ + if(s0 == s1) + return false; + const auto c = std::lcm(s0, s1); + return !((s0 * (l0 - 1) >= c) && (s1 * (l1 - 1) >= c)); +} + +// Compare a 1D direction with a multi-index hyperface for collisions. +bool valid_length_stride_1d_multi(const unsigned int idx, + const std::vector l, + const std::vector s, + const int verbose) +{ + size_t l0{0}, s0{0}; + std::vector l1{}, s1{}; + for(unsigned int i = 0; i < l.size(); ++i) + { + if(i == idx) + { + l0 = l[i]; + s0 = s[i]; + } + else + { + l1.push_back(l[i]); + s1.push_back(s[i]); + } + } + + if(verbose > 4) + { + std::cout << "l0: " << l0 << "\ts0: " << s0 << std::endl; + } + + // We only need to go to the maximum pointer offset for (l1,s1). 
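+    // Offsets along direction idx that exceed the largest offset reachable by
+    // the remaining indices can never collide with them, so only multiples of
+    // s0 up to that bound need to be recorded in the candidate set below.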
+ const auto max_offset + = std::accumulate(l1.begin(), l1.end(), (size_t)1, std::multiplies()) + - std ::inner_product(l1.begin(), l1.end(), s1.begin(), (size_t)0); + std::unordered_set a0{}; + for(size_t i = 1; i < l0; ++i) + { + const auto val = i * s0; + if(val <= max_offset) + a0.insert(val); + else + break; + } + + if(verbose > 5) + { + std::cout << "a0:"; + for(auto i : a0) + std::cout << " " << i; + std::cout << std::endl; + + std::cout << "l1:"; + for(auto i : l1) + std::cout << " " << i; + std::cout << std::endl; + + std::cout << "s1:"; + for(auto i : s1) + std::cout << " " << i; + std::cout << std::endl; + } + + // TODO: this can be multi-threaded, since find(...) is thread-safe. + std::vector index(l1.size()); + std::fill(index.begin(), index.end(), 0); + do + { + const int i = std::inner_product(index.begin(), index.end(), s1.begin(), (size_t)0); + if(i > 0 && (i % s0 == 0)) + { + // TODO: use an ordered set and binary search + if(verbose > 6) + std::cout << i << std::endl; + if(a0.find(i) != a0.end()) + { + if(verbose > 4) + { + std::cout << "l0: " << l0 << "\ts0: " << s0 << std::endl; + std::cout << "l1:"; + for(const auto li : l1) + std::cout << " " << li; + std::cout << " s1:"; + for(const auto si : s1) + std::cout << " " << si; + std::cout << std::endl; + std::cout << "Found duplicate: " << i << std::endl; + } + return false; + } + } + } while(increment_rowmajor(index, l1)); + + return true; +} + +// Compare a hyperface with another hyperface for collisions. +bool valid_length_stride_multi_multi(const std::vector l0, + const std::vector s0, + const std::vector l1, + const std::vector s1) +{ + std::unordered_set a0{}; + + const auto max_offset + = std::accumulate(l1.begin(), l1.end(), (size_t)1, std::multiplies()) + - std::inner_product(l1.begin(), l1.end(), s1.begin(), (size_t)0); + std::vector index0(l0.size()); // TODO: check this + std::fill(index0.begin(), index0.end(), 0); + do + { + const auto i = std::inner_product(index0.begin(), index0.end(), s0.begin(), (size_t)0); + if(i > max_offset) + a0.insert(i); + } while(increment_rowmajor(index0, l0)); + + std::vector index1(l1.size()); + std::fill(index1.begin(), index1.end(), 0); + do + { + const auto i = std::inner_product(index1.begin(), index1.end(), s1.begin(), (size_t)0); + if(i > 0) + { + // TODO: use an ordered set and binary search + if(a0.find(i) != a0.end()) + { + + return false; + } + } + } while(increment_rowmajor(index1, l1)); + + return true; +} + +bool valid_length_stride_3d(const std::vector& l, + const std::vector& s, + const int verbose) +{ + // Check that 2D faces are valid: + if(!valid_length_stride_2d(l[0], l[1], s[0], s[1])) + return false; + if(!valid_length_stride_2d(l[0], l[2], s[0], s[2])) + return false; + if(!valid_length_stride_2d(l[1], l[2], s[1], s[2])) + return false; + + // If the 2D faces are valid, check an axis vs a face for collisions: + bool invalid = false; +#ifdef _OPENMP +#pragma omp parallel for +#endif + for(int idx = 0; idx < 3; ++idx) + { + if(!valid_length_stride_1d_multi(idx, l, s, verbose)) + { +#ifdef _OPENMP +#pragma omp cancel for +#endif + invalid = true; + } + } + if(invalid) + return false; + return true; +} + +bool valid_length_stride_4d(const std::vector& l, + const std::vector& s, + const int verbose) +{ + if(l.size() != 4) + { + throw std::runtime_error("Incorrect dimensions for valid_length_stride_4d"); + } + + // Check that 2D faces are valid: + for(int idx0 = 0; idx0 < 3; ++idx0) + { + for(int idx1 = idx0 + 1; idx1 < 4; ++idx1) + { + 
if(!valid_length_stride_2d(l[idx0], l[idx1], s[idx0], s[idx1])) + return false; + } + } + + bool invalid = false; + // Check that 1D vs 3D faces are valid: +#ifdef _OPENMP +#pragma omp parallel for +#endif + for(int idx0 = 0; idx0 < 4; ++idx0) + { + if(!valid_length_stride_1d_multi(idx0, l, s, verbose)) + { +#ifdef _OPENMP +#pragma omp cancel for +#endif + invalid = true; + } + } + if(invalid) + return false; + + // Check that 2D vs 2D faces are valid: + + // First, get all the permutations + std::vector> perms; + std::vector v(l.size()); + std::fill(v.begin(), v.begin() + 2, 0); + std::fill(v.begin() + 2, v.end(), 1); + do + { + perms.push_back(v); + if(verbose > 3) + { + std::cout << "v:"; + for(const auto i : v) + { + std::cout << " " << i; + } + std::cout << "\n"; + } + } while(std::next_permutation(v.begin(), v.end())); + + // Then loop over all of the permutations. +#ifdef _OPENMP +#pragma omp parallel for +#endif + for(size_t iperm = 0; iperm < perms.size(); ++iperm) + { + std::vector l0(2); + std::vector s0(2); + std::vector l1(2); + std::vector s1(2); + for(size_t i = 0; i < l.size(); ++i) + { + if(perms[iperm][i] == 0) + { + l0.push_back(l[i]); + s0.push_back(s[i]); + } + else + { + l1.push_back(l[i]); + s1.push_back(s[i]); + } + } + + if(verbose > 3) + { + std::cout << "\tl0:"; + for(const auto i : l0) + { + std::cout << " " << i; + } + std::cout << "\n"; + std::cout << "\ts0:"; + for(const auto i : s0) + { + std::cout << " " << i; + } + std::cout << "\n"; + std::cout << "\tl1:"; + for(const auto i : l1) + { + std::cout << " " << i; + } + std::cout << "\n"; + std::cout << "\ts1:"; + for(const auto i : s1) + { + std::cout << " " << i; + } + std::cout << "\n"; + } + + if(!valid_length_stride_multi_multi(l0, s0, l1, s1)) + { +#ifdef _OPENMP +#pragma omp cancel for +#endif + invalid = true; + } + } + if(invalid) + return false; + + return true; +} + +bool valid_length_stride_generald(const std::vector l, + const std::vector s, + const int verbose) +{ + if(verbose > 2) + { + std::cout << "checking dimension " << l.size() << std::endl; + } + + // Recurse on d-1 hyper-faces: + for(unsigned int idx = 0; idx < l.size(); ++idx) + { + std::vector l0{}; + std::vector s0{}; + for(size_t i = 0; i < l.size(); ++i) + { + if(i != idx) + { + l0.push_back(l[i]); + s0.push_back(s[i]); + } + } + if(!array_valid(l0, s0, verbose)) + return false; + } + + // Handle the 1D vs (N-1) case: + for(unsigned int idx = 0; idx < l.size(); ++idx) + { + if(!valid_length_stride_1d_multi(idx, l, s, verbose)) + return false; + } + + for(size_t dim0 = 2; dim0 <= l.size() / 2; ++dim0) + { + const size_t dim1 = l.size() - dim0; + if(verbose > 2) + std::cout << "dims: " << dim0 << " " << dim1 << std::endl; + + // We iterate over all permutations of an array of length l.size() which contains dim0 zeros + // and dim1 ones. We start with {0, ..., 0, 1, ... 1} to guarantee that we hit all the + // possibilities. + + // First, get all the permutations + std::vector> perms; + std::vector v(l.size()); + std::fill(v.begin(), v.begin() + dim1, 0); + std::fill(v.begin() + dim1, v.end(), 1); + do + { + perms.push_back(v); + if(verbose > 3) + { + std::cout << "v:"; + for(const auto i : v) + { + std::cout << " " << i; + } + std::cout << "\n"; + } + + } while(std::next_permutation(v.begin(), v.end())); + + bool invalid = false; + // Then loop over all of the permutations. 
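+        // Each permutation splits the axes into a dim0-subset and a dim1-subset;
+        // the two resulting sub-arrays are then cross-checked for overlapping
+        // offsets by valid_length_stride_multi_multi.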
+#ifdef _OPENMP +#pragma omp parallel for +#endif + for(size_t iperm = 0; iperm < perms.size(); ++iperm) + { + std::vector l0(dim0); + std::vector s0(dim0); + std::vector l1(dim1); + std::vector s1(dim1); + + for(size_t i = 0; i < l.size(); ++i) + { + if(v[i] == 0) + { + l0.push_back(l[i]); + s0.push_back(s[i]); + } + else + { + l1.push_back(l[i]); + s1.push_back(s[i]); + } + } + + if(verbose > 3) + { + std::cout << "\tl0:"; + for(const auto i : l0) + { + std::cout << " " << i; + } + std::cout << "\n"; + std::cout << "\ts0:"; + for(const auto i : s0) + { + std::cout << " " << i; + } + std::cout << "\n"; + std::cout << "\tl1:"; + for(const auto i : l1) + { + std::cout << " " << i; + } + std::cout << "\n"; + std::cout << "\ts1:"; + for(const auto i : s1) + { + std::cout << " " << i; + } + std::cout << "\n"; + } + + if(!valid_length_stride_multi_multi(l0, s0, l1, s1)) + { +#ifdef _OPENMP +#pragma omp cancel for +#endif + invalid = true; + } + } + if(invalid) + return false; + } + + return true; +} + +bool sort_by_stride(const std::pair& ls0, const std::pair& ls1) +{ + return ls0.second < ls1.second; +} + +bool array_valid(const std::vector& length, + const std::vector& stride, + const int verbose) +{ + if(length.size() != stride.size()) + return false; + + // If a length is 1, then the stride is irrelevant. + // If a length is > 1, then the corresponding stride must be > 1. + std::vector l{}, s{}; + for(unsigned int i = 0; i < length.size(); ++i) + { + if(length[i] > 1) + { + if(stride[i] == 0) + return false; + l.push_back(length[i]); + s.push_back(stride[i]); + } + } + + if(length.size() > 1) + { + // Check happy path. + bool happy_path = true; + std::vector> ls; + for(size_t idx = 0; idx < length.size(); ++idx) + { + ls.push_back(std::pair(length[idx], stride[idx])); + } + std::sort(ls.begin(), ls.end(), sort_by_stride); + + if(verbose > 2) + { + for(size_t idx = 0; idx < ls.size(); ++idx) + { + std::cout << ls[idx].first << "\t" << ls[idx].second << "\n"; + } + } + + for(size_t idx = 1; idx < ls.size(); ++idx) + { + if(ls[idx].second < ls[idx - 1].first * ls[idx - 1].second) + { + happy_path = false; + break; + } + } + if(happy_path) + { + if(verbose > 2) + { + std::cout << "happy path\n"; + } + return true; + } + } + + switch(l.size()) + { + case 0: + return true; + break; + case 1: + return s[0] != 0; + break; + case 2: + { + return valid_length_stride_2d(l[0], l[1], s[0], s[1]); + break; + } + case 3: + { + return valid_length_stride_3d(l, s, verbose); + break; + } + case 4: + { + return valid_length_stride_4d(l, s, verbose); + break; + } + default: + return valid_length_stride_generald(l, s, verbose); + return true; + } + + return true; +} diff --git a/shared/array_validator.h b/shared/array_validator.h new file mode 100644 index 0000000..ce85173 --- /dev/null +++ b/shared/array_validator.h @@ -0,0 +1,31 @@ +// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#ifndef ARRAY_VALIDATOR_H +#define ARRAY_VALIDATOR_H + +#include + +// Checks whether the array with given length and stride has multi-index collisions. +bool array_valid(const std::vector& length, + const std::vector& stride, + const int verbose = 0); + +#endif diff --git a/shared/concurrency.h b/shared/concurrency.h new file mode 100644 index 0000000..a36c7c1 --- /dev/null +++ b/shared/concurrency.h @@ -0,0 +1,41 @@ +// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#pragma once + +#include + +#ifndef WIN32 +#include +#endif + +// work out how many parallel tasks to run, based on available +// resources. on Linux, this will look at the cpu affinity mask (if +// available) which might be restricted in a container. otherwise, +// return std::thread::hardware_concurrency(). +static unsigned int rocfft_concurrency() +{ +#ifndef WIN32 + cpu_set_t cpuset; + if(sched_getaffinity(0, sizeof(cpuset), &cpuset) == 0) + return CPU_COUNT(&cpuset); +#endif + return std::thread::hardware_concurrency(); +} diff --git a/shared/data_gen_device.h b/shared/data_gen_device.h new file mode 100644 index 0000000..77fb012 --- /dev/null +++ b/shared/data_gen_device.h @@ -0,0 +1,1303 @@ +// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#ifndef DATA_GEN_DEVICE_H +#define DATA_GEN_DEVICE_H + +// rocRAND can generate warnings if inline asm is not available for +// some architectures. data generation isn't performance-critical, +// so just disable inline asm to prevent the warnings. +#define ROCRAND_DISABLE_INLINE_ASM + +#include "../shared/arithmetic.h" +#include "../shared/device_properties.h" +#include "../shared/gpubuf.h" +#include "../shared/increment.h" +#include "../shared/rocfft_complex.h" +#include +#include +#include +#include +#include +#include + +static const unsigned int DATA_GEN_THREADS = 8; +static const unsigned int DATA_GEN_GRID_Y_MAX = 64; + +template +struct input_val_1D +{ + T val1; +}; + +template +struct input_val_2D +{ + T val1; + T val2; +}; + +template +struct input_val_3D +{ + T val1; + T val2; + T val3; +}; + +template +static input_val_1D get_input_val(const T& val) +{ + return input_val_1D{val}; +} + +template +static input_val_2D get_input_val(const std::tuple& val) +{ + return input_val_2D{std::get<0>(val), std::get<1>(val)}; +} + +template +static input_val_3D get_input_val(const std::tuple& val) +{ + return input_val_3D{std::get<0>(val), std::get<1>(val), std::get<2>(val)}; +} + +template +__device__ static size_t + compute_index(const input_val_1D& length, const input_val_1D& stride, size_t base) +{ + return (length.val1 * stride.val1) + base; +} + +template +__device__ static size_t + compute_index(const input_val_2D& length, const input_val_2D& stride, size_t base) +{ + return (length.val1 * stride.val1) + (length.val2 * stride.val2) + base; +} + +template +__device__ static size_t + compute_index(const input_val_3D& length, const input_val_3D& stride, size_t base) +{ + return (length.val1 * stride.val1) + (length.val2 * stride.val2) + (length.val3 * stride.val3) + + base; +} + +template +static inline input_val_1D make_zero_length(const input_val_1D& whole_length) +{ + return input_val_1D{0}; +} + +template +static inline input_val_2D make_zero_length(const input_val_2D& whole_length) +{ + return input_val_2D{0, 0}; +} + +template +static inline input_val_3D make_zero_length(const input_val_3D& whole_length) +{ + return input_val_3D{0, 0, 0}; +} + +template +static inline input_val_1D make_unit_stride(const input_val_1D& whole_length) +{ + return input_val_1D{1}; +} + +template +static inline input_val_2D make_unit_stride(const input_val_2D& whole_length) +{ + return input_val_2D{1, whole_length.val1}; +} + +template +static inline input_val_3D make_unit_stride(const input_val_3D& whole_length) +{ + return input_val_3D{1, whole_length.val1, whole_length.val1 * whole_length.val2}; +} + +template +__device__ static input_val_1D get_length(const size_t i, const input_val_1D& whole_length) +{ + auto xlen = whole_length.val1; + + auto xidx = i % xlen; + + return input_val_1D{xidx}; +} + +template +__device__ static input_val_2D get_length(const size_t i, const input_val_2D& whole_length) +{ + auto xlen = whole_length.val1; + auto ylen = whole_length.val2; + + auto xidx = i % xlen; + auto yidx = i / xlen % 
ylen; + + return input_val_2D{xidx, yidx}; +} + +template +__device__ static input_val_3D get_length(const size_t i, const input_val_3D& whole_length) +{ + auto xlen = whole_length.val1; + auto ylen = whole_length.val2; + auto zlen = whole_length.val3; + + auto xidx = i % xlen; + auto yidx = i / xlen % ylen; + auto zidx = i / xlen / ylen % zlen; + + return input_val_3D{xidx, yidx, zidx}; +} + +template +__device__ static size_t get_batch(const size_t i, const input_val_1D& whole_length) +{ + auto xlen = whole_length.val1; + + auto yidx = i / xlen; + + return yidx; +} + +template +__device__ static size_t get_batch(const size_t i, const input_val_2D& whole_length) +{ + auto xlen = whole_length.val1; + auto ylen = whole_length.val2; + + auto zidx = i / xlen / ylen; + + return zidx; +} + +template +__device__ static size_t get_batch(const size_t i, const input_val_3D& length) +{ + auto xlen = length.val1; + auto ylen = length.val2; + auto zlen = length.val3; + + auto widx = i / xlen / ylen / zlen; + + return widx; +} + +__device__ static double make_random_val(hiprandStatePhilox4_32_10* gen_state, double offset) +{ + return hiprand_uniform_double(gen_state) + offset; +} + +__device__ static float make_random_val(hiprandStatePhilox4_32_10* gen_state, float offset) +{ + return hiprand_uniform(gen_state) + offset; +} + +__device__ static _Float16 make_random_val(hiprandStatePhilox4_32_10* gen_state, _Float16 offset) +{ + return static_cast<_Float16>(hiprand_uniform(gen_state)) + offset; +} + +template +__device__ static void set_imag_zero(const size_t pos, Tcomplex* x) +{ + x[pos].y = 0.0; +} + +template +__device__ static void set_imag_zero(const size_t pos, Tfloat* xreal, Tfloat* ximag) +{ + ximag[pos] = 0.0; +} + +template +__device__ static void conjugate(const size_t pos, const size_t cpos, Tcomplex* x) +{ + x[pos].x = x[cpos].x; + x[pos].y = -x[cpos].y; +} + +template +__device__ static void conjugate(const size_t pos, const size_t cpos, Tfloat* xreal, Tfloat* ximag) +{ + xreal[pos] = xreal[cpos]; + ximag[pos] = -ximag[cpos]; +} + +template +__global__ static void __launch_bounds__(DATA_GEN_THREADS) + generate_random_interleaved_data_kernel(const Tint whole_length, + const Tint zero_length, + const size_t idist, + const size_t isize, + const Tint istride, + rocfft_complex* data) +{ + auto const i = static_cast(threadIdx.x) + blockIdx.x * blockDim.x + + blockIdx.y * gridDim.x * DATA_GEN_THREADS; + static_assert(sizeof(i) >= sizeof(isize)); + if(i < isize) + { + auto i_length = get_length(i, whole_length); + auto i_batch = get_batch(i, whole_length); + auto i_base = i_batch * idist; + + auto seed = compute_index(zero_length, istride, i_base); + auto idx = compute_index(i_length, istride, i_base); + + hiprandStatePhilox4_32_10 gen_state; + hiprand_init(seed, idx, 0, &gen_state); + + data[idx].x = make_random_val(&gen_state, static_cast(-0.5)); + data[idx].y = make_random_val(&gen_state, static_cast(-0.5)); + } +} + +template +__global__ static void __launch_bounds__(DATA_GEN_THREADS) + generate_interleaved_data_kernel(const Tint whole_length, + const size_t idist, + const size_t isize, + const Tint istride, + const Tint ustride, + const Treal inv_scale, + rocfft_complex* data) +{ + auto const i = static_cast(threadIdx.x) + blockIdx.x * blockDim.x + + blockIdx.y * gridDim.x * DATA_GEN_THREADS; + static_assert(sizeof(i) >= sizeof(isize)); + if(i < isize) + { + const auto i_length = get_length(i, whole_length); + const auto i_batch = get_batch(i, whole_length); + const auto i_base = i_batch * 
idist; + + const auto val = static_cast(-0.5) + + static_cast( + static_cast(compute_index(i_length, ustride, 0))) + * inv_scale; + + const auto idx = compute_index(i_length, istride, i_base); + + data[idx].x = val; + data[idx].y = val; + } +} + +template +__global__ static void __launch_bounds__(DATA_GEN_THREADS) + generate_random_planar_data_kernel(const Tint whole_length, + const Tint zero_length, + const size_t idist, + const size_t isize, + const Tint istride, + Treal* real_data, + Treal* imag_data) +{ + auto const i = static_cast(threadIdx.x) + blockIdx.x * blockDim.x + + blockIdx.y * gridDim.x * DATA_GEN_THREADS; + static_assert(sizeof(i) >= sizeof(isize)); + if(i < isize) + { + auto i_length = get_length(i, whole_length); + auto i_batch = get_batch(i, whole_length); + auto i_base = i_batch * idist; + + auto seed = compute_index(zero_length, istride, i_base); + auto idx = compute_index(i_length, istride, i_base); + + hiprandStatePhilox4_32_10 gen_state; + hiprand_init(seed, idx, 0, &gen_state); + + real_data[idx] = make_random_val(&gen_state, static_cast(-0.5)); + imag_data[idx] = make_random_val(&gen_state, static_cast(-0.5)); + } +} + +template +__global__ static void __launch_bounds__(DATA_GEN_THREADS) + generate_planar_data_kernel(const Tint whole_length, + const size_t idist, + const size_t isize, + const Tint istride, + const Tint ustride, + const Treal inv_scale, + Treal* real_data, + Treal* imag_data) +{ + auto const i = static_cast(threadIdx.x) + blockIdx.x * blockDim.x + + blockIdx.y * gridDim.x * DATA_GEN_THREADS; + static_assert(sizeof(i) >= sizeof(isize)); + if(i < isize) + { + const auto i_length = get_length(i, whole_length); + const auto i_batch = get_batch(i, whole_length); + const auto i_base = i_batch * idist; + + const auto val = static_cast(-0.5) + + static_cast( + static_cast(compute_index(i_length, ustride, 0))) + * inv_scale; + + const auto idx = compute_index(i_length, istride, i_base); + + real_data[idx] = val; + imag_data[idx] = val; + } +} + +template +__global__ static void __launch_bounds__(DATA_GEN_THREADS) + generate_random_real_data_kernel(const Tint whole_length, + const Tint zero_length, + const size_t idist, + const size_t isize, + const Tint istride, + Treal* data) +{ + auto const i = static_cast(threadIdx.x) + blockIdx.x * blockDim.x + + blockIdx.y * gridDim.x * DATA_GEN_THREADS; + static_assert(sizeof(i) >= sizeof(isize)); + if(i < isize) + { + auto i_length = get_length(i, whole_length); + auto i_batch = get_batch(i, whole_length); + auto i_base = i_batch * idist; + + auto seed = compute_index(zero_length, istride, i_base); + auto idx = compute_index(i_length, istride, i_base); + + hiprandStatePhilox4_32_10 gen_state; + hiprand_init(seed, idx, 0, &gen_state); + + data[idx] = make_random_val(&gen_state, static_cast(-0.5)); + } +} + +template +__global__ static void __launch_bounds__(DATA_GEN_THREADS) + generate_real_data_kernel(const Tint whole_length, + const size_t idist, + const size_t isize, + const Tint istride, + const Tint ustride, + const Treal inv_scale, + Treal* data) +{ + auto const i = static_cast(threadIdx.x) + blockIdx.x * blockDim.x + + blockIdx.y * gridDim.x * DATA_GEN_THREADS; + static_assert(sizeof(i) >= sizeof(isize)); + if(i < isize) + { + const auto i_length = get_length(i, whole_length); + const auto i_batch = get_batch(i, whole_length); + const auto i_base = i_batch * idist; + + const auto val = static_cast(-0.5) + + static_cast( + static_cast(compute_index(i_length, ustride, 0))) + * inv_scale; + + const auto idx = 
compute_index(i_length, istride, i_base); + + data[idx] = val; + } +} + +// For complex-to-real transforms, the input data must be Hermitiam-symmetric. +// That is, u_k is the complex conjugate of u_{-k}, where k is the wavevector in Fourier +// space. For multi-dimensional data, this means that we only need to store a bit more +// than half of the complex values; the rest are redundant. However, there are still +// some restrictions: +// * the origin and Nyquist value(s) must be real-valued +// * some of the remaining values are still redundant, and you might get different results +// than you expect if the values don't agree. + +template +__global__ static void impose_hermitian_symmetry_interleaved_1D_kernel(Tcomplex* x, + const size_t Nx, + const size_t xstride, + const size_t dist, + const size_t batch_total, + const bool Nxeven) +{ + auto id_batch = static_cast(threadIdx.x) + blockIdx.x * blockDim.x; + static_assert(sizeof(id_batch) == sizeof(size_t)); + + if(id_batch < batch_total) + { + id_batch *= dist; + + set_imag_zero(id_batch, x); + + if(Nxeven) + set_imag_zero(id_batch + (Nx / 2) * xstride, x); + } +} + +template +__global__ static void impose_hermitian_symmetry_planar_1D_kernel(Tfloat* xreal, + Tfloat* ximag, + const size_t Nx, + const size_t xstride, + const size_t dist, + const size_t batch_total, + const bool Nxeven) +{ + auto id_batch = static_cast(threadIdx.x) + blockIdx.x * blockDim.x; + static_assert(sizeof(id_batch) == sizeof(size_t)); + + if(id_batch < batch_total) + { + id_batch *= dist; + + set_imag_zero(id_batch, xreal, ximag); + + if(Nxeven) + set_imag_zero(id_batch + (Nx / 2) * xstride, xreal, ximag); + } +} + +template +__global__ static void impose_hermitian_symmetry_interleaved_2D_kernel(Tcomplex* x, + const size_t Nx, + const size_t Ny, + const size_t xstride, + const size_t ystride, + const size_t dist, + const size_t batch_total, + const size_t x_total, + const bool Nxeven, + const bool Nyeven) +{ + auto id_batch = static_cast(threadIdx.x) + blockIdx.x * blockDim.x; + const auto id_x = static_cast(threadIdx.y) + blockIdx.y * blockDim.y; + static_assert(sizeof(id_batch) == sizeof(size_t)); + static_assert(sizeof(id_x) == sizeof(size_t)); + + if(id_batch < batch_total) + { + id_batch *= dist; + + if(id_x == 0) + set_imag_zero(id_batch, x); + + if(id_x == 0 && Nxeven) + set_imag_zero(id_batch + (Nx / 2) * xstride, x); + + if(id_x == 0 && Nyeven) + set_imag_zero(id_batch + ystride * (Ny / 2), x); + + if(id_x == 0 && Nxeven && Nyeven) + set_imag_zero(id_batch + xstride * (Nx / 2) + ystride * (Ny / 2), x); + + if(id_x < x_total) + { + conjugate(id_batch + xstride * (Nx - (id_x + 1)), id_batch + xstride * (id_x + 1), x); + + if(Nyeven) + conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny / 2), + id_batch + xstride * (id_x + 1) + ystride * (Ny / 2), + x); + } + } +} + +template +__global__ static void impose_hermitian_symmetry_planar_2D_kernel(Tfloat* xreal, + Tfloat* ximag, + const size_t Nx, + const size_t Ny, + const size_t xstride, + const size_t ystride, + const size_t dist, + const size_t batch_total, + const size_t x_total, + const bool Nxeven, + const bool Nyeven) +{ + auto id_batch = static_cast(threadIdx.x) + blockIdx.x * blockDim.x; + const auto id_x = static_cast(threadIdx.y) + blockIdx.y * blockDim.y; + static_assert(sizeof(id_batch) == sizeof(size_t)); + static_assert(sizeof(id_x) == sizeof(size_t)); + + if(id_batch < batch_total) + { + id_batch *= dist; + + if(id_x == 0) + set_imag_zero(id_batch, xreal, ximag); + + if(id_x == 0 && 
Nxeven) + set_imag_zero(id_batch + (Nx / 2) * xstride, xreal, ximag); + + if(id_x == 0 && Nyeven) + set_imag_zero(id_batch + ystride * (Ny / 2), xreal, ximag); + + if(id_x == 0 && Nxeven && Nyeven) + set_imag_zero(id_batch + xstride * (Nx / 2) + ystride * (Ny / 2), xreal, ximag); + + if(id_x < x_total) + { + conjugate(id_batch + xstride * (Nx - (id_x + 1)), + id_batch + xstride * (id_x + 1), + xreal, + ximag); + + if(Nyeven) + conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny / 2), + id_batch + xstride * (id_x + 1) + ystride * (Ny / 2), + xreal, + ximag); + } + } +} + +template +__global__ static void impose_hermitian_symmetry_interleaved_3D_kernel(Tcomplex* x, + const size_t Nx, + const size_t Ny, + const size_t Nz, + const size_t xstride, + const size_t ystride, + const size_t zstride, + const size_t dist, + const size_t batch_total, + const size_t x_total, + const size_t y_total, + const size_t y_total_half, + const bool Nxeven, + const bool Nyeven, + const bool Nzeven) +{ + auto id_batch = static_cast(threadIdx.x) + blockIdx.x * blockDim.x; + const auto id_x = static_cast(threadIdx.y) + blockIdx.y * blockDim.y; + const auto id_y = static_cast(threadIdx.z) + blockIdx.z * blockDim.z; + static_assert(sizeof(id_batch) == sizeof(size_t)); + static_assert(sizeof(id_x) == sizeof(size_t)); + static_assert(sizeof(id_y) == sizeof(size_t)); + + if(id_batch < batch_total) + { + auto id_x_y_zero = (id_x == 0 && id_y == 0); + + id_batch *= dist; + + if(id_x_y_zero) + set_imag_zero(id_batch, x); + + if(Nxeven && id_x_y_zero) + set_imag_zero(id_batch + xstride * (Nx / 2), x); + + if(Nyeven && id_x_y_zero) + set_imag_zero(id_batch + ystride * (Ny / 2), x); + + if(Nzeven && id_x_y_zero) + set_imag_zero(id_batch + zstride * (Nz / 2), x); + + if(Nxeven && Nyeven && id_x_y_zero) + set_imag_zero(id_batch + xstride * (Nx / 2) + ystride * (Ny / 2), x); + + if(Nxeven && Nzeven && id_x_y_zero) + set_imag_zero(id_batch + xstride * (Nx / 2) + zstride * (Nz / 2), x); + + if(Nyeven && Nzeven && id_x_y_zero) + set_imag_zero(id_batch + ystride * (Ny / 2) + zstride * (Nz / 2), x); + + if(Nxeven && Nyeven && Nzeven && id_x_y_zero) + set_imag_zero(id_batch + xstride * (Nx / 2) + ystride * (Ny / 2) + zstride * (Nz / 2), + x); + + if(id_x == 0 && id_y < y_total_half) + conjugate(id_batch + ystride * (Ny - (id_y + 1)), id_batch + ystride * (id_y + 1), x); + + if(Nxeven && id_x == 0 && id_y < y_total_half) + conjugate(id_batch + xstride * (Nx / 2) + ystride * (Ny - (id_y + 1)), + id_batch + xstride * (Nx / 2) + ystride * (id_y + 1), + x); + + if(id_x < x_total && id_y == 0) + conjugate(id_batch + xstride * (Nx - (id_x + 1)), id_batch + xstride * (id_x + 1), x); + + if(Nyeven && id_x < x_total && id_y == 0) + conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny / 2), + id_batch + xstride * (id_x + 1) + ystride * (Ny / 2), + x); + + if(id_x < x_total && id_y < y_total) + conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny - (id_y + 1)), + id_batch + xstride * (id_x + 1) + ystride * (id_y + 1), + x); + + if(Nzeven) + { + if(id_x < x_total && id_y == 0) + conjugate(id_batch + xstride * (Nx - (id_x + 1)) + zstride * (Nz / 2), + id_batch + xstride * (id_x + 1) + zstride * (Nz / 2), + x); + + if(Nyeven && id_x < x_total && id_y == 0) + conjugate(id_batch + xstride * (Nx - (id_x + 1)) + zstride * (Nz / 2), + id_batch + xstride * (id_x + 1) + zstride * (Nz / 2), + x); + + if(id_x == 0 && id_y < y_total_half) + conjugate(id_batch + ystride * (Ny - (id_y + 1)) + zstride * (Nz / 2), + id_batch + 
ystride * (id_y + 1) + zstride * (Nz / 2), + x); + + if(Nxeven && id_x == 0 && id_y < y_total_half) + conjugate(id_batch + xstride * (Nx / 2) + ystride * (Ny - (id_y + 1)) + + zstride * (Nz / 2), + id_batch + xstride * (Nx / 2) + ystride * (id_y + 1) + zstride * (Nz / 2), + x); + + if(id_x < x_total && id_y < y_total) + conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny - (id_y + 1)) + + zstride * (Nz / 2), + id_batch + xstride * (id_x + 1) + ystride * (id_y + 1) + + zstride * (Nz / 2), + x); + } + } +} + +template +__global__ static void impose_hermitian_symmetry_planar_3D_kernel(Tfloat* xreal, + Tfloat* ximag, + const size_t Nx, + const size_t Ny, + const size_t Nz, + const size_t xstride, + const size_t ystride, + const size_t zstride, + const size_t dist, + const size_t batch_total, + const size_t x_total, + const size_t y_total, + const size_t y_total_half, + const bool Nxeven, + const bool Nyeven, + const bool Nzeven) +{ + auto id_batch = static_cast(threadIdx.x) + blockIdx.x * blockDim.x; + const auto id_x = static_cast(threadIdx.y) + blockIdx.y * blockDim.y; + const auto id_y = static_cast(threadIdx.z) + blockIdx.z * blockDim.z; + static_assert(sizeof(id_batch) == sizeof(size_t)); + static_assert(sizeof(id_x) == sizeof(size_t)); + static_assert(sizeof(id_y) == sizeof(size_t)); + + if(id_batch < batch_total) + { + auto id_x_y_zero = (id_x == 0 && id_y == 0); + + id_batch *= dist; + + if(id_x_y_zero) + set_imag_zero(id_batch, xreal, ximag); + + if(Nxeven && id_x_y_zero) + set_imag_zero(id_batch + xstride * (Nx / 2), xreal, ximag); + + if(Nyeven && id_x_y_zero) + set_imag_zero(id_batch + ystride * (Ny / 2), xreal, ximag); + + if(Nzeven && id_x_y_zero) + set_imag_zero(id_batch + zstride * (Nz / 2), xreal, ximag); + + if(Nxeven && Nyeven && id_x_y_zero) + set_imag_zero(id_batch + xstride * (Nx / 2) + ystride * (Ny / 2), xreal, ximag); + + if(Nxeven && Nzeven && id_x_y_zero) + set_imag_zero(id_batch + xstride * (Nx / 2) + zstride * (Nz / 2), xreal, ximag); + + if(Nyeven && Nzeven && id_x_y_zero) + set_imag_zero(id_batch + ystride * (Ny / 2) + zstride * (Nz / 2), xreal, ximag); + + if(Nxeven && Nyeven && Nzeven && id_x_y_zero) + set_imag_zero(id_batch + xstride * (Nx / 2) + ystride * (Ny / 2) + zstride * (Nz / 2), + xreal, + ximag); + + if(id_x == 0 && id_y < y_total_half) + conjugate(id_batch + ystride * (Ny - (id_y + 1)), + id_batch + ystride * (id_y + 1), + xreal, + ximag); + + if(Nxeven && id_x == 0 && id_y < y_total_half) + conjugate(id_batch + xstride * (Nx / 2) + ystride * (Ny - (id_y + 1)), + id_batch + xstride * (Nx / 2) + ystride * (id_y + 1), + xreal, + ximag); + + if(id_x < x_total && id_y == 0) + conjugate(id_batch + xstride * (Nx - (id_x + 1)), + id_batch + xstride * (id_x + 1), + xreal, + ximag); + + if(Nyeven && id_x < x_total && id_y == 0) + conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny / 2), + id_batch + xstride * (id_x + 1) + ystride * (Ny / 2), + xreal, + ximag); + + if(id_x < x_total && id_y < y_total) + conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny - (id_y + 1)), + id_batch + xstride * (id_x + 1) + ystride * (id_y + 1), + xreal, + ximag); + + if(Nzeven) + { + if(id_x < x_total && id_y == 0) + conjugate(id_batch + xstride * (Nx - (id_x + 1)) + zstride * (Nz / 2), + id_batch + xstride * (id_x + 1) + zstride * (Nz / 2), + xreal, + ximag); + + if(Nyeven && id_x < x_total && id_y == 0) + conjugate(id_batch + xstride * (Nx - (id_x + 1)) + zstride * (Nz / 2), + id_batch + xstride * (id_x + 1) + zstride * (Nz / 2), + xreal, 
+ ximag); + + if(id_x == 0 && id_y < y_total_half) + conjugate(id_batch + ystride * (Ny - (id_y + 1)) + zstride * (Nz / 2), + id_batch + ystride * (id_y + 1) + zstride * (Nz / 2), + xreal, + ximag); + + if(Nxeven && id_x == 0 && id_y < y_total_half) + conjugate(id_batch + xstride * (Nx / 2) + ystride * (Ny - (id_y + 1)) + + zstride * (Nz / 2), + id_batch + xstride * (Nx / 2) + ystride * (id_y + 1) + zstride * (Nz / 2), + xreal, + ximag); + + if(id_x < x_total && id_y < y_total) + conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny - (id_y + 1)) + + zstride * (Nz / 2), + id_batch + xstride * (id_x + 1) + ystride * (id_y + 1) + + zstride * (Nz / 2), + xreal, + ximag); + } + } +} + +// get grid dimensions for data gen kernel +static dim3 generate_data_gridDim(const size_t isize) +{ + auto blockSize = DATA_GEN_THREADS; + // total number of blocks needed in the grid + auto numBlocks_setup = DivRoundingUp(isize, blockSize); + + // Total work items per dimension in the grid is counted in + // uint32_t. Since each thread initializes one element, very + // large amounts of data will overflow this total size if we do + // all this work in one grid dimension, causing launch failure. + // + // CUDA also generally allows for effectively unlimited grid X + // dim, but Y and Z are more limited. + auto gridDim_y = std::min(DATA_GEN_GRID_Y_MAX, numBlocks_setup); + auto gridDim_x = DivRoundingUp(numBlocks_setup, DATA_GEN_GRID_Y_MAX); + return {gridDim_x, gridDim_y}; +} + +// get grid dimensions for hermitian symmetrizer kernel +static dim3 generate_hermitian_gridDim(const std::vector& length, + const size_t batch, + const size_t blockSize) +{ + dim3 gridDim; + + switch(length.size()) + { + case 1: + gridDim = dim3(DivRoundingUp(batch, blockSize)); + break; + case 2: + gridDim = dim3(DivRoundingUp(batch, blockSize), + DivRoundingUp((length[0] + 1) / 2 - 1, blockSize)); + break; + case 3: + gridDim = dim3(DivRoundingUp(batch, blockSize), + DivRoundingUp((length[0] + 1) / 2 - 1, blockSize), + DivRoundingUp(length[1] - 1, blockSize)); + break; + default: + throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry"); + } + + return gridDim; +} + +static dim3 generate_blockDim(const std::vector& length, const size_t blockSize) +{ + dim3 blockDim; + + switch(length.size()) + { + case 1: + blockDim = dim3(blockSize); + break; + case 2: + blockDim = dim3(blockSize, blockSize); + break; + case 3: + blockDim = dim3(blockSize, blockSize, blockSize); + break; + default: + throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry"); + } + + return blockDim; +} + +template +static void generate_random_interleaved_data(const Tint& whole_length, + const size_t idist, + const size_t isize, + const Tint& whole_stride, + rocfft_complex* input_data, + const hipDeviceProp_t& deviceProp) +{ + auto input_length = get_input_val(whole_length); + auto zero_length = make_zero_length(input_length); + auto input_stride = get_input_val(whole_stride); + + dim3 gridDim = generate_data_gridDim(isize); + dim3 blockDim{DATA_GEN_THREADS}; + + launch_limits_check("generate_random_interleaved_data_kernel", gridDim, blockDim, deviceProp); + + hipLaunchKernelGGL( + HIP_KERNEL_NAME(generate_random_interleaved_data_kernel), + gridDim, + blockDim, + 0, // sharedMemBytes + 0, // stream + input_length, + zero_length, + idist, + isize, + input_stride, + input_data); + auto err = hipGetLastError(); + if(err != hipSuccess) + throw std::runtime_error("generate_random_interleaved_data_kernel launch failure: " + + 
std::string(hipGetErrorName(err))); +} + +template +static void generate_interleaved_data(const Tint& whole_length, + const size_t idist, + const size_t isize, + const Tint& whole_stride, + const size_t nbatch, + rocfft_complex* input_data, + const hipDeviceProp_t& deviceProp) +{ + const auto input_length = get_input_val(whole_length); + const auto input_stride = get_input_val(whole_stride); + const auto unit_stride = make_unit_stride(input_length); + + const auto inv_scale + = static_cast(1.0) + / static_cast(static_cast(isize) / nbatch - 1); + + dim3 gridDim = generate_data_gridDim(isize); + dim3 blockDim{DATA_GEN_THREADS}; + + launch_limits_check("generate_interleaved_data_kernel", gridDim, blockDim, deviceProp); + + hipLaunchKernelGGL( + HIP_KERNEL_NAME(generate_interleaved_data_kernel), + gridDim, + blockDim, + 0, // sharedMemBytes + 0, // stream + input_length, + idist, + isize, + input_stride, + unit_stride, + inv_scale, + input_data); + auto err = hipGetLastError(); + if(err != hipSuccess) + throw std::runtime_error("generate_interleaved_data_kernel launch failure: " + + std::string(hipGetErrorName(err))); +} + +template +static void generate_random_planar_data(const Tint& whole_length, + const size_t idist, + const size_t isize, + const Tint& whole_stride, + Treal* real_data, + Treal* imag_data, + const hipDeviceProp_t& deviceProp) +{ + const auto input_length = get_input_val(whole_length); + const auto zero_length = make_zero_length(input_length); + const auto input_stride = get_input_val(whole_stride); + + dim3 gridDim = generate_data_gridDim(isize); + dim3 blockDim{DATA_GEN_THREADS}; + + launch_limits_check("generate_random_planar_data_kernel", gridDim, blockDim, deviceProp); + + hipLaunchKernelGGL( + HIP_KERNEL_NAME(generate_random_planar_data_kernel), + gridDim, + blockDim, + 0, // sharedMemBytes + 0, // stream + input_length, + zero_length, + idist, + isize, + input_stride, + real_data, + imag_data); + auto err = hipGetLastError(); + if(err != hipSuccess) + throw std::runtime_error("generate_random_planar_data_kernel launch failure: " + + std::string(hipGetErrorName(err))); +} + +template +static void generate_planar_data(const Tint& whole_length, + const size_t idist, + const size_t isize, + const Tint& whole_stride, + const size_t nbatch, + Treal* real_data, + Treal* imag_data, + const hipDeviceProp_t& deviceProp) +{ + const auto input_length = get_input_val(whole_length); + const auto input_stride = get_input_val(whole_stride); + const auto unit_stride = make_unit_stride(input_length); + + const auto inv_scale + = static_cast(1.0) + / static_cast(static_cast(isize) / nbatch - 1); + + dim3 gridDim = generate_data_gridDim(isize); + dim3 blockDim{DATA_GEN_THREADS}; + + launch_limits_check("generate_planar_data_kernel", gridDim, blockDim, deviceProp); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_planar_data_kernel), + gridDim, + blockDim, + 0, // sharedMemBytes + 0, // stream + input_length, + idist, + isize, + input_stride, + unit_stride, + inv_scale, + real_data, + imag_data); + auto err = hipGetLastError(); + if(err != hipSuccess) + throw std::runtime_error("generate_planar_data_kernel launch failure: " + + std::string(hipGetErrorName(err))); +} + +template +static void generate_random_real_data(const Tint& whole_length, + const size_t idist, + const size_t isize, + const Tint& whole_stride, + Treal* input_data, + const hipDeviceProp_t& deviceProp) +{ + const auto input_length = get_input_val(whole_length); + const auto zero_length = make_zero_length(input_length); + 
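+    // As in the other random generators, zero_length collapses the seed index to
+    // the batch's base offset, so the kernel seeds its Philox state once per batch
+    // and uses the element index as the subsequence; generated data is therefore
+    // reproducible for a given layout.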
const auto input_stride = get_input_val(whole_stride); + + dim3 gridDim = generate_data_gridDim(isize); + dim3 blockDim{DATA_GEN_THREADS}; + + launch_limits_check("generate_random_real_data_kernel", gridDim, blockDim, deviceProp); + + hipLaunchKernelGGL( + HIP_KERNEL_NAME(generate_random_real_data_kernel), + gridDim, + blockDim, + 0, // sharedMemBytes + 0, // stream + input_length, + zero_length, + idist, + isize, + input_stride, + input_data); + auto err = hipGetLastError(); + if(err != hipSuccess) + throw std::runtime_error("generate_random_real_data_kernel launch failure: " + + std::string(hipGetErrorName(err))); +} + +template +static void generate_real_data(const Tint& whole_length, + const size_t idist, + const size_t isize, + const Tint& whole_stride, + const size_t nbatch, + Treal* input_data, + const hipDeviceProp_t& deviceProp) +{ + const auto input_length = get_input_val(whole_length); + const auto input_stride = get_input_val(whole_stride); + const auto unit_stride = make_unit_stride(input_length); + + const auto inv_scale + = static_cast(1.0) + / static_cast(static_cast(isize) / nbatch - 1); + + dim3 gridDim = generate_data_gridDim(isize); + dim3 blockDim{DATA_GEN_THREADS}; + + launch_limits_check("generate_real_data_kernel", gridDim, blockDim, deviceProp); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_real_data_kernel), + gridDim, + blockDim, + 0, // sharedMemBytes + 0, // stream + input_length, + idist, + isize, + input_stride, + unit_stride, + inv_scale, + input_data); + auto err = hipGetLastError(); + if(err != hipSuccess) + throw std::runtime_error("generate_real_data_kernel launch failure: " + + std::string(hipGetErrorName(err))); +} + +template +static void impose_hermitian_symmetry_interleaved(const std::vector& length, + const std::vector& ilength, + const std::vector& stride, + const size_t dist, + const size_t batch, + Tcomplex* input_data, + const hipDeviceProp_t& deviceProp) +{ + auto blockSize = DATA_GEN_THREADS; + auto blockDim = generate_blockDim(length, blockSize); + auto gridDim = generate_hermitian_gridDim(length, batch, blockSize); + + switch(length.size()) + { + case 1: + { + launch_limits_check( + "impose_hermitian_symmetry_interleaved_1D_kernel", gridDim, blockDim, deviceProp); + + hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_1D_kernel, + gridDim, + blockDim, + 0, + 0, + input_data, + length[0], + stride[0], + dist, + batch, + length[0] % 2 == 0); + + break; + } + case 2: + { + launch_limits_check( + "impose_hermitian_symmetry_interleaved_2D_kernel", gridDim, blockDim, deviceProp); + + hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_2D_kernel, + gridDim, + blockDim, + 0, + 0, + input_data, + length[0], + length[1], + stride[0], + stride[1], + dist, + batch, + (ilength[0] + 1) / 2 - 1, + length[0] % 2 == 0, + length[1] % 2 == 0); + + break; + } + case 3: + { + launch_limits_check( + "impose_hermitian_symmetry_interleaved_3D_kernel", gridDim, blockDim, deviceProp); + + hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_3D_kernel, + gridDim, + blockDim, + 0, + 0, + input_data, + length[0], + length[1], + length[2], + stride[0], + stride[1], + stride[2], + dist, + batch, + (ilength[0] + 1) / 2 - 1, + ilength[1] - 1, + (ilength[1] + 1) / 2 - 1, + length[0] % 2 == 0, + length[1] % 2 == 0, + length[2] % 2 == 0); + break; + } + default: + throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry"); + } + auto err = hipGetLastError(); + if(err != hipSuccess) + throw 
std::runtime_error("impose_hermitian_symmetry_interleaved_kernel launch failure: " + + std::string(hipGetErrorName(err))); +} + +template +static void impose_hermitian_symmetry_planar(const std::vector& length, + const std::vector& ilength, + const std::vector& stride, + const size_t dist, + const size_t batch, + Tfloat* input_data_real, + Tfloat* input_data_imag, + const hipDeviceProp_t& deviceProp) +{ + auto blockSize = DATA_GEN_THREADS; + auto blockDim = generate_blockDim(length, blockSize); + auto gridDim = generate_hermitian_gridDim(length, batch, blockSize); + + switch(length.size()) + { + case 1: + { + launch_limits_check( + "impose_hermitian_symmetry_planar_1D_kernel", gridDim, blockDim, deviceProp); + + hipLaunchKernelGGL(impose_hermitian_symmetry_planar_1D_kernel, + gridDim, + blockDim, + 0, + 0, + input_data_real, + input_data_imag, + length[0], + stride[0], + dist, + batch, + length[0] % 2 == 0); + + break; + } + case 2: + { + launch_limits_check( + "impose_hermitian_symmetry_planar_2D_kernel", gridDim, blockDim, deviceProp); + + hipLaunchKernelGGL(impose_hermitian_symmetry_planar_2D_kernel, + gridDim, + blockDim, + 0, + 0, + input_data_real, + input_data_imag, + length[0], + length[1], + stride[0], + stride[1], + dist, + batch, + (ilength[0] + 1) / 2 - 1, + length[0] % 2 == 0, + length[1] % 2 == 0); + + break; + } + case 3: + { + launch_limits_check( + "impose_hermitian_symmetry_planar_3D_kernel", gridDim, blockDim, deviceProp); + + hipLaunchKernelGGL(impose_hermitian_symmetry_planar_3D_kernel, + gridDim, + blockDim, + 0, + 0, + input_data_real, + input_data_imag, + length[0], + length[1], + length[2], + stride[0], + stride[1], + stride[2], + dist, + batch, + (ilength[0] + 1) / 2 - 1, + ilength[1] - 1, + (ilength[1] + 1) / 2 - 1, + length[0] % 2 == 0, + length[1] % 2 == 0, + length[2] % 2 == 0); + break; + } + default: + throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry"); + } + auto err = hipGetLastError(); + if(err != hipSuccess) + throw std::runtime_error("impose_hermitian_symmetry_planar_kernel launch failure: " + + std::string(hipGetErrorName(err))); +} + +#endif // DATA_GEN_DEVICE_H diff --git a/shared/data_gen_host.h b/shared/data_gen_host.h new file mode 100644 index 0000000..29d3854 --- /dev/null +++ b/shared/data_gen_host.h @@ -0,0 +1,881 @@ +// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
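+// Host-side data generation helpers used by the clients: specialized
+// compute_index overloads for 1-, 2- and 3-D lengths/strides, OpenMP work
+// partitioning, and host-side handling of Hermitian symmetry for
+// complex-to-real input.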
+ +#ifndef DATA_GEN_HOST_H +#define DATA_GEN_HOST_H + +#include "../shared/hostbuf.h" +#include "../shared/increment.h" +#include +#include +#include +#include +#include + +// Specialized computation of index given 1-, 2-, 3- dimension length + stride +template +size_t compute_index(T1 length, T2 stride, size_t base) +{ + return (length * stride) + base; +} + +template +size_t + compute_index(const std::tuple& length, const std::tuple& stride, size_t base) +{ + static_assert(std::is_integral::value, "Integral required."); + static_assert(std::is_integral::value, "Integral required."); + return (std::get<0>(length) * std::get<0>(stride)) + (std::get<1>(length) * std::get<1>(stride)) + + base; +} + +template +size_t compute_index(const std::tuple& length, + const std::tuple& stride, + size_t base) +{ + static_assert(std::is_integral::value, "Integral required."); + static_assert(std::is_integral::value, "Integral required."); + return (std::get<0>(length) * std::get<0>(stride)) + (std::get<1>(length) * std::get<1>(stride)) + + (std::get<2>(length) * std::get<2>(stride)) + base; +} + +// count the number of total iterations for 1-, 2-, and 3-D dimensions +template +size_t count_iters(const T1& i) +{ + return i; +} + +template +size_t count_iters(const std::tuple& i) +{ + return std::get<0>(i) * std::get<1>(i); +} + +template +size_t count_iters(const std::tuple& i) +{ + return std::get<0>(i) * std::get<1>(i) * std::get<2>(i); +} + +template +T1 make_unit_stride(const T1& whole_length) +{ + return static_cast(1); +} + +template +std::tuple make_unit_stride(const std::tuple& whole_length) +{ + return std::make_tuple(static_cast(1), static_cast(std::get<0>(whole_length))); +} + +template +std::tuple make_unit_stride(const std::tuple& whole_length) +{ + return std::make_tuple(static_cast(1), + static_cast(std::get<0>(whole_length)), + static_cast(std::get<0>(whole_length)) + * static_cast(std::get<1>(whole_length))); +} + +// Work out how many partitions to break our iteration problem into +template +static size_t compute_partition_count(T1 length) +{ +#ifdef _OPENMP + // we seem to get contention from too many threads, which slows + // things down. particularly noticeable with mix_3D tests + static const size_t MAX_PARTITIONS = 8; + size_t iters = count_iters(length); + size_t hw_threads = std::min(MAX_PARTITIONS, static_cast(omp_get_num_procs())); + if(!hw_threads) + return 1; + + // don't bother threading problem sizes that are too small. pick + // an arbitrary number of iterations and ensure that each thread + // has at least that many iterations to process + static const size_t MIN_ITERS_PER_THREAD = 2048; + + // either use the whole CPU, or use ceil(iters/iters_per_thread) + return std::min(hw_threads, (iters + MIN_ITERS_PER_THREAD + 1) / MIN_ITERS_PER_THREAD); +#else + return 1; +#endif +} + +// Break a scalar length into some number of pieces, returning +// [(start0, end0), (start1, end1), ...] 
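// As an illustrative example (hypothetical values): partition_base(10, 3) below
// splits a length of 10 into [(0,3), (3,6), (6,10)]; the last pair is widened so the
// partitions always cover the whole range even when the division is uneven.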
+template +std::vector> partition_base(const T1& length, size_t num_parts) +{ + static_assert(std::is_integral::value, "Integral required."); + + // make sure we don't exceed the length + num_parts = std::min(length, num_parts); + + std::vector> ret(num_parts); + auto partition_size = length / num_parts; + T1 cur_partition = 0; + for(size_t i = 0; i < num_parts; ++i, cur_partition += partition_size) + { + ret[i].first = cur_partition; + ret[i].second = cur_partition + partition_size; + } + // last partition might not divide evenly, fix it up + ret.back().second = length; + return ret; +} + +// Returns pairs of startindex, endindex, for 1D, 2D, 3D lengths +template +std::vector> partition_rowmajor(const T1& length) +{ + return partition_base(length, compute_partition_count(length)); +} + +// Partition on the leftmost part of the tuple, for row-major indexing +template +std::vector, std::tuple>> + partition_rowmajor(const std::tuple& length) +{ + auto partitions = partition_base(std::get<0>(length), compute_partition_count(length)); + std::vector, std::tuple>> ret(partitions.size()); + for(size_t i = 0; i < partitions.size(); ++i) + { + std::get<0>(ret[i].first) = partitions[i].first; + std::get<1>(ret[i].first) = 0; + std::get<0>(ret[i].second) = partitions[i].second; + std::get<1>(ret[i].second) = std::get<1>(length); + } + return ret; +} +template +std::vector, std::tuple>> + partition_rowmajor(const std::tuple& length) +{ + auto partitions = partition_base(std::get<0>(length), compute_partition_count(length)); + std::vector, std::tuple>> ret(partitions.size()); + for(size_t i = 0; i < partitions.size(); ++i) + { + std::get<0>(ret[i].first) = partitions[i].first; + std::get<1>(ret[i].first) = 0; + std::get<2>(ret[i].first) = 0; + std::get<0>(ret[i].second) = partitions[i].second; + std::get<1>(ret[i].second) = std::get<1>(length); + std::get<2>(ret[i].second) = std::get<2>(length); + } + return ret; +} + +// For complex-to-real transforms, the input data must be Hermitian-symmetric. +// That is, u_k is the complex conjugate of u_{-k}, where k is the wavevector in Fourier +// space. For multi-dimensional data, this means that we only need to store a bit more +// than half of the complex values; the rest are redundant. However, there are still +// some restrictions: +// * the origin and Nyquist value(s) must be real-valued +// * some of the remaining values are still redundant, and you might get different results +// than you expect if the values don't agree. +// Below are some example kernels which impose Hermitian symmetry on a complex array +// of the given dimensions.
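// As a concrete 1D illustration: for a transform of even length N = 8, element u_0 (DC)
// and element u_4 (Nyquist) must have zero imaginary part, and the remaining entries
// must satisfy u_{8-k} = conj(u_k), e.g. u_7 = conj(u_1), u_6 = conj(u_2), u_5 = conj(u_3).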
+ +template +static void impose_hermitian_symmetry_interleaved_1D(std::vector& vals, + const std::vector& length, + const std::vector& istride, + const Tsize idist, + const Tsize nbatch) +{ + for(unsigned int ibatch = 0; ibatch < nbatch; ++ibatch) + { + auto data = ((std::complex*)vals[0].data()) + ibatch * idist; + + data[0].imag(0.0); + + if(length[0] % 2 == 0) + { + data[istride[0] * (length[0] / 2)].imag(0.0); + } + } +} + +template +static void impose_hermitian_symmetry_planar_1D(std::vector& vals, + const std::vector& length, + const std::vector& istride, + const Tsize idist, + const Tsize nbatch) +{ + for(unsigned int ibatch = 0; ibatch < nbatch; ++ibatch) + { + auto data_imag = ((Tfloat*)vals[1].data()) + ibatch * idist; + + data_imag[0] = 0.0; + + if(length[0] % 2 == 0) + { + data_imag[istride[0] * (length[0] / 2)] = 0.0; + } + } +} + +template +static void impose_hermitian_symmetry_interleaved_2D(std::vector& vals, + const std::vector& length, + const std::vector& istride, + const Tsize idist, + const Tsize nbatch) +{ + for(unsigned int ibatch = 0; ibatch < nbatch; ++ibatch) + { + auto data = ((std::complex*)vals[0].data()) + ibatch * idist; + + data[0].imag(0.0); + + if(length[0] % 2 == 0) + { + data[istride[0] * (length[0] / 2)].imag(0.0); + } + + if(length[1] % 2 == 0) + { + data[istride[1] * (length[1] / 2)].imag(0.0); + } + + if(length[0] % 2 == 0 && length[1] % 2 == 0) + { + data[istride[0] * (length[0] / 2) + istride[1] * (length[1] / 2)].imag(0.0); + } + + for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) + { + data[istride[0] * (length[0] - i)] = std::conj(data[istride[0] * i]); + } + + if(length[1] % 2 == 0) + { + for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) + { + data[istride[0] * (length[0] - i) + istride[1] * (length[1] / 2)] + = std::conj(data[istride[0] * i + istride[1] * (length[1] / 2)]); + } + } + } +} + +template +static void impose_hermitian_symmetry_planar_2D(std::vector& vals, + const std::vector& length, + const std::vector& istride, + const Tsize idist, + const Tsize nbatch) +{ + for(unsigned int ibatch = 0; ibatch < nbatch; ++ibatch) + { + auto data_real = ((Tfloat*)vals[0].data()) + ibatch * idist; + auto data_imag = ((Tfloat*)vals[1].data()) + ibatch * idist; + + data_imag[0] = 0.0; + + if(length[0] % 2 == 0) + { + data_imag[istride[0] * (length[0] / 2)] = 0.0; + } + + if(length[1] % 2 == 0) + { + data_imag[istride[1] * (length[1] / 2)] = 0.0; + } + + if(length[0] % 2 == 0 && length[1] % 2 == 0) + { + data_imag[istride[0] * (length[0] / 2) + istride[1] * (length[1] / 2)] = 0.0; + } + + for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) + { + data_real[istride[0] * (length[0] - i)] = data_real[istride[0] * i]; + data_imag[istride[0] * (length[0] - i)] = -data_imag[istride[0] * i]; + } + + if(length[1] % 2 == 0) + { + for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) + { + data_real[istride[0] * (length[0] - i) + istride[1] * (length[1] / 2)] + = data_real[istride[0] * i + istride[1] * (length[1] / 2)]; + data_imag[istride[0] * (length[0] - i) + istride[1] * (length[1] / 2)] + = -data_imag[istride[0] * i + istride[1] * (length[1] / 2)]; + } + } + } +} + +template +static void impose_hermitian_symmetry_interleaved_3D(std::vector& vals, + const std::vector& length, + const std::vector& istride, + const Tsize idist, + const Tsize nbatch) +{ + for(unsigned int ibatch = 0; ibatch < nbatch; ++ibatch) + { + auto data = ((std::complex*)vals[0].data()) + ibatch * idist; + + data[0].imag(0.0); + + if(length[0] % 2 == 0) + { + data[istride[0] * 
(length[0] / 2)].imag(0.0); + } + + if(length[1] % 2 == 0) + { + data[istride[1] * (length[1] / 2)].imag(0.0); + } + + if(length[2] % 2 == 0) + { + data[istride[2] * (length[2] / 2)].imag(0.0); + } + + if(length[0] % 2 == 0 && length[1] % 2 == 0) + { + data[istride[0] * (length[0] / 2) + istride[1] * (length[1] / 2)].imag(0.0); + } + + if(length[0] % 2 == 0 && length[2] % 2 == 0) + { + data[istride[0] * (length[0] / 2) + istride[2] * (length[2] / 2)].imag(0.0); + } + if(length[1] % 2 == 0 && length[2] % 2 == 0) + { + data[istride[1] * (length[1] / 2) + istride[2] * (length[2] / 2)].imag(0.0); + } + + if(length[0] % 2 == 0 && length[1] % 2 == 0 && length[2] % 2 == 0) + { + data[istride[0] * (length[0] / 2) + istride[1] * (length[1] / 2) + + istride[2] * (length[2] / 2)] + .imag(0.0); + } + + // y-axis: + for(unsigned int j = 1; j < (length[1] + 1) / 2; ++j) + { + data[istride[1] * (length[1] - j)] = std::conj(data[istride[1] * j]); + } + + if(length[0] % 2 == 0) + { + // y-axis at x-nyquist + for(unsigned int j = 1; j < (length[1] + 1) / 2; ++j) + { + data[istride[0] * (length[0] / 2) + istride[1] * (length[1] - j)] + = std::conj(data[istride[0] * (length[0] / 2) + istride[1] * j]); + } + } + + // x-axis: + for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) + { + data[istride[0] * (length[0] - i)] = std::conj(data[istride[0] * i]); + } + + if(length[1] % 2 == 0) + { + // x-axis at y-nyquist + for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) + { + data[istride[0] * (length[0] - i) + istride[1] * (length[1] / 2)] + = std::conj(data[istride[0] * i + istride[1] * (length[1] / 2)]); + } + } + + // x-y plane: + for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) + { + for(unsigned int j = 1; j < length[1]; ++j) + { + data[istride[0] * (length[0] - i) + istride[1] * (length[1] - j)] + = std::conj(data[istride[0] * i + istride[1] * j]); + } + } + + if(length[2] % 2 == 0) + { + // x-axis at z-nyquist + for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) + { + data[istride[0] * (length[0] - i) + istride[2] * (length[2] / 2)] + = std::conj(data[istride[0] * i + istride[2] * (length[2] / 2)]); + } + if(length[1] % 2 == 0) + { + // x-axis at yz-nyquist + for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) + { + data[istride[0] * (length[0] - i) + istride[2] * (length[2] / 2)] + = std::conj(data[istride[0] * i + istride[2] * (length[2] / 2)]); + } + } + + // y-axis: at z-nyquist + for(unsigned int j = 1; j < (length[1] + 1) / 2; ++j) + { + data[istride[1] * (length[1] - j) + istride[2] * (length[2] / 2)] + = std::conj(data[istride[1] * j + istride[2] * (length[2] / 2)]); + } + + if(length[0] % 2 == 0) + { + // y-axis: at xz-nyquist + for(unsigned int j = 1; j < (length[1] + 1) / 2; ++j) + { + data[istride[0] * (length[0] / 2) + istride[1] * (length[1] - j) + + istride[2] * (length[2] / 2)] + = std::conj(data[istride[0] * (length[0] / 2) + istride[1] * j + + istride[2] * (length[2] / 2)]); + } + } + + // x-y plane: at z-nyquist + for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) + { + for(unsigned int j = 1; j < length[1]; ++j) + { + data[istride[0] * (length[0] - i) + istride[1] * (length[1] - j) + + istride[2] * (length[2] / 2)] + = std::conj( + data[istride[0] * i + istride[1] * j + istride[2] * (length[2] / 2)]); + } + } + } + } +} + +template +static void impose_hermitian_symmetry_planar_3D(std::vector& vals, + const std::vector& length, + const std::vector& istride, + const Tsize idist, + const Tsize nbatch) +{ + for(unsigned int ibatch = 0; ibatch < nbatch; ++ibatch) + { + auto 
data_real = ((Tfloat*)vals[0].data()) + ibatch * idist; + auto data_imag = ((Tfloat*)vals[1].data()) + ibatch * idist; + + data_imag[0] = 0.0; + + if(length[0] % 2 == 0) + { + data_imag[istride[0] * (length[0] / 2)] = 0.0; + } + + if(length[1] % 2 == 0) + { + data_imag[istride[1] * (length[1] / 2)] = 0.0; + } + + if(length[2] % 2 == 0) + { + data_imag[istride[2] * (length[2] / 2)] = 0.0; + } + + if(length[0] % 2 == 0 && length[1] % 2 == 0) + { + data_imag[istride[0] * (length[0] / 2) + istride[1] * (length[1] / 2)] = 0.0; + } + + if(length[0] % 2 == 0 && length[2] % 2 == 0) + { + data_imag[istride[0] * (length[0] / 2) + istride[2] * (length[2] / 2)] = 0.0; + } + if(length[1] % 2 == 0 && length[2] % 2 == 0) + { + data_imag[istride[1] * (length[1] / 2) + istride[2] * (length[2] / 2)] = 0.0; + } + + if(length[0] % 2 == 0 && length[1] % 2 == 0 && length[2] % 2 == 0) + { + data_imag[istride[0] * (length[0] / 2) + istride[1] * (length[1] / 2) + + istride[2] * (length[2] / 2)] + = 0.0; + } + + // y-axis: + for(unsigned int j = 1; j < (length[1] + 1) / 2; ++j) + { + data_real[istride[1] * (length[1] - j)] = data_real[istride[1] * j]; + data_imag[istride[1] * (length[1] - j)] = -data_imag[istride[1] * j]; + } + + if(length[0] % 2 == 0) + { + // y-axis at x-nyquist + for(unsigned int j = 1; j < (length[1] + 1) / 2; ++j) + { + data_real[istride[0] * (length[0] / 2) + istride[1] * (length[1] - j)] + = data_real[istride[0] * (length[0] / 2) + istride[1] * j]; + data_imag[istride[0] * (length[0] / 2) + istride[1] * (length[1] - j)] + = -data_imag[istride[0] * (length[0] / 2) + istride[1] * j]; + } + } + + // x-axis: + for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) + { + data_real[istride[0] * (length[0] - i)] = data_real[istride[0] * i]; + data_imag[istride[0] * (length[0] - i)] = -data_imag[istride[0] * i]; + } + + if(length[1] % 2 == 0) + { + // x-axis at y-nyquist + for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) + { + data_real[istride[0] * (length[0] - i) + istride[1] * (length[1] / 2)] + = data_real[istride[0] * i + istride[1] * (length[1] / 2)]; + data_imag[istride[0] * (length[0] - i) + istride[1] * (length[1] / 2)] + = -data_imag[istride[0] * i + istride[1] * (length[1] / 2)]; + } + } + + // x-y plane: + for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) + { + for(unsigned int j = 1; j < length[1]; ++j) + { + data_real[istride[0] * (length[0] - i) + istride[1] * (length[1] - j)] + = data_real[istride[0] * i + istride[1] * j]; + data_imag[istride[0] * (length[0] - i) + istride[1] * (length[1] - j)] + = -data_imag[istride[0] * i + istride[1] * j]; + } + } + + if(length[2] % 2 == 0) + { + // x-axis at z-nyquist + for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) + { + data_real[istride[0] * (length[0] - i) + istride[2] * (length[2] / 2)] + = data_real[istride[0] * i + istride[2] * (length[2] / 2)]; + data_imag[istride[0] * (length[0] - i) + istride[2] * (length[2] / 2)] + = -data_imag[istride[0] * i + istride[2] * (length[2] / 2)]; + } + if(length[1] % 2 == 0) + { + // x-axis at yz-nyquist + for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) + { + data_real[istride[0] * (length[0] - i) + istride[2] * (length[2] / 2)] + = data_real[istride[0] * i + istride[2] * (length[2] / 2)]; + data_imag[istride[0] * (length[0] - i) + istride[2] * (length[2] / 2)] + = -data_imag[istride[0] * i + istride[2] * (length[2] / 2)]; + } + } + + // y-axis: at z-nyquist + for(unsigned int j = 1; j < (length[1] + 1) / 2; ++j) + { + data_real[istride[1] * (length[1] - j) + istride[2] * (length[2] / 
2)] + = data_real[istride[1] * j + istride[2] * (length[2] / 2)]; + data_imag[istride[1] * (length[1] - j) + istride[2] * (length[2] / 2)] + = -data_imag[istride[1] * j + istride[2] * (length[2] / 2)]; + } + + if(length[0] % 2 == 0) + { + // y-axis: at xz-nyquist + for(unsigned int j = 1; j < (length[1] + 1) / 2; ++j) + { + data_real[istride[0] * (length[0] / 2) + istride[1] * (length[1] - j) + + istride[2] * (length[2] / 2)] + = data_real[istride[0] * (length[0] / 2) + istride[1] * j + + istride[2] * (length[2] / 2)]; + data_imag[istride[0] * (length[0] / 2) + istride[1] * (length[1] - j) + + istride[2] * (length[2] / 2)] + = -data_imag[istride[0] * (length[0] / 2) + istride[1] * j + + istride[2] * (length[2] / 2)]; + } + } + + // x-y plane: at z-nyquist + for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) + { + for(unsigned int j = 1; j < length[1]; ++j) + { + data_real[istride[0] * (length[0] - i) + istride[1] * (length[1] - j) + + istride[2] * (length[2] / 2)] + = data_real[istride[0] * i + istride[1] * j + istride[2] * (length[2] / 2)]; + data_imag[istride[0] * (length[0] - i) + istride[1] * (length[1] - j) + + istride[2] * (length[2] / 2)] + = -data_imag[istride[0] * i + istride[1] * j + + istride[2] * (length[2] / 2)]; + } + } + } + } +} + +template +static void generate_random_interleaved_data(std::vector& input, + const Tint1& whole_length, + const Tint1& whole_stride, + const size_t idist, + const size_t nbatch) +{ + auto idata = (std::complex*)input[0].data(); + size_t i_base = 0; + auto partitions = partition_rowmajor(whole_length); + for(unsigned int b = 0; b < nbatch; b++, i_base += idist) + { +#pragma omp parallel for num_threads(partitions.size()) + for(size_t part = 0; part < partitions.size(); ++part) + { + auto index = partitions[part].first; + const auto length = partitions[part].second; + std::mt19937 gen(compute_index(index, whole_stride, i_base)); + do + { + const auto i = compute_index(index, whole_stride, i_base); + const Tfloat x = (Tfloat)gen() / (Tfloat)gen.max(); + const Tfloat y = (Tfloat)gen() / (Tfloat)gen.max(); + const std::complex val(x, y); + idata[i] = val; + } while(increment_rowmajor(index, length)); + } + } +} + +template +static void generate_interleaved_data(std::vector& input, + const Tint1& whole_length, + const Tint1& whole_stride, + const size_t idist, + const size_t nbatch) +{ + auto idata = (std::complex*)input[0].data(); + size_t i_base = 0; + auto partitions = partition_rowmajor(whole_length); + auto unit_stride = make_unit_stride(whole_length); + + const Tfloat inv_scale = 1.0 / static_cast(count_iters(whole_length) - 1); + + for(unsigned int b = 0; b < nbatch; b++, i_base += idist) + { +#pragma omp parallel for num_threads(partitions.size()) + for(size_t part = 0; part < partitions.size(); ++part) + { + auto index = partitions[part].first; + const auto length = partitions[part].second; + do + { + const auto val_xy + = -0.5 + static_cast(compute_index(index, unit_stride, 0)) * inv_scale; + + const std::complex val(val_xy, val_xy); + + const auto i = compute_index(index, whole_stride, i_base); + + idata[i] = val; + } while(increment_rowmajor(index, length)); + } + } +} + +template +static void generate_random_planar_data(std::vector& input, + const Tint1& whole_length, + const Tint1& whole_stride, + const size_t idist, + const size_t nbatch) +{ + auto ireal = (Tfloat*)input[0].data(); + auto iimag = (Tfloat*)input[1].data(); + size_t i_base = 0; + auto partitions = partition_rowmajor(whole_length); + for(unsigned int b = 0; b < nbatch; 
b++, i_base += idist) + { +#pragma omp parallel for num_threads(partitions.size()) + for(size_t part = 0; part < partitions.size(); ++part) + { + auto index = partitions[part].first; + const auto length = partitions[part].second; + std::mt19937 gen(compute_index(index, whole_stride, i_base)); + do + { + const auto i = compute_index(index, whole_stride, i_base); + const std::complex val((Tfloat)gen() / (Tfloat)gen.max(), + (Tfloat)gen() / (Tfloat)gen.max()); + ireal[i] = val.real(); + iimag[i] = val.imag(); + } while(increment_rowmajor(index, length)); + } + } +} + +template +static void generate_planar_data(std::vector& input, + const Tint1& whole_length, + const Tint1& whole_stride, + const size_t idist, + const size_t nbatch) +{ + + auto ireal = (Tfloat*)input[0].data(); + auto iimag = (Tfloat*)input[1].data(); + size_t i_base = 0; + auto partitions = partition_rowmajor(whole_length); + auto unit_stride = make_unit_stride(whole_length); + + const Tfloat inv_scale = 1.0 / static_cast(count_iters(whole_length) - 1); + + for(unsigned int b = 0; b < nbatch; b++, i_base += idist) + { +#pragma omp parallel for num_threads(partitions.size()) + for(size_t part = 0; part < partitions.size(); ++part) + { + auto index = partitions[part].first; + const auto length = partitions[part].second; + do + { + const auto val_xy + = -0.5 + static_cast(compute_index(index, unit_stride, 0)) * inv_scale; + + const auto i = compute_index(index, whole_stride, i_base); + + ireal[i] = val_xy; + iimag[i] = val_xy; + } while(increment_rowmajor(index, length)); + } + } +} + +template +static void generate_random_real_data(std::vector& input, + const Tint1& whole_length, + const Tint1& whole_stride, + const size_t idist, + const size_t nbatch) +{ + auto idata = (Tfloat*)input[0].data(); + size_t i_base = 0; + auto partitions = partition_rowmajor(whole_length); + for(unsigned int b = 0; b < nbatch; b++, i_base += idist) + { +#pragma omp parallel for num_threads(partitions.size()) + for(size_t part = 0; part < partitions.size(); ++part) + { + auto index = partitions[part].first; + const auto length = partitions[part].second; + std::mt19937 gen(compute_index(index, whole_stride, i_base)); + do + { + const auto i = compute_index(index, whole_stride, i_base); + const Tfloat val = (Tfloat)gen() / (Tfloat)gen.max(); + idata[i] = val; + } while(increment_rowmajor(index, length)); + } + } +} + +template +static void generate_real_data(std::vector& input, + const Tint1& whole_length, + const Tint1& whole_stride, + const size_t idist, + const size_t nbatch) +{ + + auto idata = (Tfloat*)input[0].data(); + size_t i_base = 0; + auto partitions = partition_rowmajor(whole_length); + auto unit_stride = make_unit_stride(whole_length); + + const Tfloat inv_scale = 1.0 / static_cast(count_iters(whole_length) - 1); + + for(unsigned int b = 0; b < nbatch; b++, i_base += idist) + { +#pragma omp parallel for num_threads(partitions.size()) + for(size_t part = 0; part < partitions.size(); ++part) + { + auto index = partitions[part].first; + const auto length = partitions[part].second; + do + { + const auto i = compute_index(index, whole_stride, i_base); + + idata[i] + = -0.5 + static_cast(compute_index(index, unit_stride, 0)) * inv_scale; + } while(increment_rowmajor(index, length)); + } + } +} + +template +static void impose_hermitian_symmetry_interleaved(std::vector& vals, + const std::vector& length, + const std::vector& istride, + const Tsize idist, + const Tsize nbatch) +{ + switch(length.size()) + { + case 1: + 
impose_hermitian_symmetry_interleaved_1D(vals, length, istride, idist, nbatch); + break; + case 2: + impose_hermitian_symmetry_interleaved_2D(vals, length, istride, idist, nbatch); + break; + case 3: + impose_hermitian_symmetry_interleaved_3D(vals, length, istride, idist, nbatch); + break; + default: + throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry"); + } +} + +template +static void impose_hermitian_symmetry_planar(std::vector& vals, + const std::vector& length, + const std::vector& istride, + const Tsize idist, + const Tsize nbatch) +{ + switch(length.size()) + { + case 1: + impose_hermitian_symmetry_planar_1D(vals, length, istride, idist, nbatch); + break; + case 2: + impose_hermitian_symmetry_planar_2D(vals, length, istride, idist, nbatch); + break; + case 3: + impose_hermitian_symmetry_planar_3D(vals, length, istride, idist, nbatch); + break; + default: + throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry"); + } +} + +#endif // DATA_GEN_HOST_H diff --git a/shared/device_properties.h b/shared/device_properties.h new file mode 100644 index 0000000..6e2e1e1 --- /dev/null +++ b/shared/device_properties.h @@ -0,0 +1,74 @@ +// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#ifndef ROCFFT_DEVICE_PROPS_H +#define ROCFFT_DEVICE_PROPS_H + +#include +#include +#include + +// get device properties +static hipDeviceProp_t get_curr_device_prop() +{ + hipDeviceProp_t prop; + int deviceId = 0; + if(hipGetDevice(&deviceId) != hipSuccess) + throw std::runtime_error("hipGetDevice failed."); + + if(hipGetDeviceProperties(&prop, deviceId) != hipSuccess) + throw std::runtime_error("hipGetDeviceProperties failed for deviceId " + + std::to_string(deviceId)); + + return prop; +} + +// check that the given grid/block dims will fit into the limits in +// the device properties. throws std::runtime_error if the limits +// are exceeded. +static void launch_limits_check(const std::string& kernel_name, + const dim3 gridDim, + const dim3 blockDim, + const hipDeviceProp_t& deviceProp) +{ + // Need lots of casting here because dim3 is unsigned but device + // props are signed. Cast direct comparisons to fix signedness + // issues. Promote types to 64-bit when multiplying to try to + // avoid overflow. 
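// As an arithmetic illustration of the overflow concern (numbers chosen only to show
// the mechanism): two 32-bit factors of 65536 multiply to 2^32, which wraps to 0 in
// unsigned 32-bit arithmetic; promoting the first factor to size_t keeps the product exact.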
+ + // Block limits along each dimension + if(blockDim.x > static_cast(deviceProp.maxThreadsDim[0]) + || blockDim.y > static_cast(deviceProp.maxThreadsDim[1]) + || blockDim.z > static_cast(deviceProp.maxThreadsDim[2])) + throw std::runtime_error("max threads per dim exceeded: " + kernel_name); + + // Total threads for the whole block + if(static_cast(blockDim.x) * blockDim.y * blockDim.z + > static_cast(deviceProp.maxThreadsPerBlock)) + throw std::runtime_error("max threads per block exceeded: " + kernel_name); + + // Grid dimension limits + if(gridDim.x > static_cast(deviceProp.maxGridSize[0]) + || gridDim.y > static_cast(deviceProp.maxGridSize[1]) + || gridDim.z > static_cast(deviceProp.maxGridSize[2])) + throw std::runtime_error("max grid size exceeded: " + kernel_name); +} + +#endif diff --git a/shared/enum_to_string.h b/shared/enum_to_string.h new file mode 100644 index 0000000..1c2fba0 --- /dev/null +++ b/shared/enum_to_string.h @@ -0,0 +1,81 @@ +// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#ifndef ENUM_TO_STRING_H +#define ENUM_TO_STRING_H + +#include "fft_params.h" + +// Return the string of the hipError code. 
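// Illustrative use at a hypothetical call site:
//   hipError_t err = hipDeviceSynchronize();
//   if(err != hipSuccess)
//       throw std::runtime_error("sync failed: " + hipError_to_string(err));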
+static std::string hipError_to_string(const hipError_t ret) +{ + switch(ret) + { + case hipSuccess: + return "hipSuccess"; + case hipErrorInvalidContext: + return "hipErrorInvalidContext"; + case hipErrorInvalidKernelFile: + return "hipErrorInvalidKernelFile"; + case hipErrorMemoryAllocation: + return "hipErrorMemoryAllocation"; + case hipErrorInitializationError: + return "hipErrorInitializationError"; + case hipErrorLaunchFailure: + return "hipErrorLaunchFailure"; + case hipErrorLaunchOutOfResources: + return "hipErrorLaunchOutOfResources"; + case hipErrorInvalidDevice: + return "hipErrorInvalidDevice"; + case hipErrorInvalidValue: + return "hipErrorInvalidValue"; + case hipErrorInvalidDevicePointer: + return "hipErrorInvalidDevicePointer"; + case hipErrorInvalidMemcpyDirection: + return "hipErrorInvalidMemcpyDirection"; + case hipErrorUnknown: + return "hipErrorUnknown"; + case hipErrorInvalidResourceHandle: + return "hipErrorInvalidResourceHandle"; + case hipErrorNotReady: + return "hipErrorNotReady"; + case hipErrorNoDevice: + return "hipErrorNoDevice"; + case hipErrorPeerAccessAlreadyEnabled: + return "hipErrorPeerAccessAlreadyEnabled"; + case hipErrorPeerAccessNotEnabled: + return "hipErrorPeerAccessNotEnabled"; + case hipErrorRuntimeMemory: + return "hipErrorRuntimeMemory"; + case hipErrorRuntimeOther: + return "hipErrorRuntimeOther"; + case hipErrorHostMemoryAlreadyRegistered: + return "hipErrorHostMemoryAlreadyRegistered"; + case hipErrorHostMemoryNotRegistered: + return "hipErrorHostMemoryNotRegistered"; + case hipErrorMapBufferObjectFailed: + return "hipErrorMapBufferObjectFailed"; + case hipErrorTbd: + return "hipErrorTbd"; + default: + throw std::runtime_error("unknown hipError"); + } +} +#endif diff --git a/shared/environment.h b/shared/environment.h new file mode 100644 index 0000000..7be56a0 --- /dev/null +++ b/shared/environment.h @@ -0,0 +1,97 @@ +// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +// wrappers around environment variable routines + +#pragma once + +#include + +// Windows provides "getenv" and "_putenv", but those modify the +// runtime's copy of the environment. The actual environment in the +// process control block is accessed using GetEnvironmentVariable and +// SetEnvironmentVariable. 
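// The rocfft_setenv/rocfft_getenv/rocfft_unsetenv wrappers defined below hide that
// platform difference. Illustrative use (the variable name is hypothetical):
//   rocfft_setenv("SOME_TEST_VAR", "1");
//   std::string v = rocfft_getenv("SOME_TEST_VAR");   // v == "1"
//   rocfft_unsetenv("SOME_TEST_VAR");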
+ +#ifdef WIN32 +#include +static void rocfft_setenv(const char* var, const char* value) +{ + SetEnvironmentVariable(var, value); +} +static void rocfft_unsetenv(const char* var) +{ + SetEnvironmentVariable(var, nullptr); +} +static std::string rocfft_getenv(const char* var) +{ + DWORD size = GetEnvironmentVariable(var, nullptr, 0); + std::string ret; + if(size) + { + ret.resize(size); + GetEnvironmentVariable(var, ret.data(), size); + // GetEnvironmentVariable counts the terminating null, so remove it + while(!ret.empty() && ret.back() == 0) + ret.pop_back(); + } + return ret; +} + +#else + +#include + +static void rocfft_setenv(const char* var, const char* value) +{ + setenv(var, value, 1); +} +static void rocfft_unsetenv(const char* var) +{ + unsetenv(var); +} +static std::string rocfft_getenv(const char* var) +{ + auto value = getenv(var); + return value ? value : ""; +} +#endif + +// RAII object to set an environment variable and restore it to its +// previous value on destruction +struct EnvironmentSetTemp +{ + EnvironmentSetTemp(const char* _var, const char* val) + : var(_var) + { + auto val_ptr = rocfft_getenv(_var); + if(!val_ptr.empty()) + oldvalue = val_ptr; + rocfft_setenv(_var, val); + } + ~EnvironmentSetTemp() + { + if(oldvalue.empty()) + rocfft_unsetenv(var.c_str()); + else + rocfft_setenv(var.c_str(), oldvalue.c_str()); + } + std::string var; + std::string oldvalue; +}; diff --git a/shared/fft_params.h b/shared/fft_params.h new file mode 100644 index 0000000..bf428ef --- /dev/null +++ b/shared/fft_params.h @@ -0,0 +1,3274 @@ +// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +#ifndef FFT_PARAMS_H +#define FFT_PARAMS_H + +#include +#include +#include +#include +#include +#include +#ifdef _OPENMP +#include +#endif +#include +#include +#include +#include + +#include "../shared/arithmetic.h" +#include "../shared/array_validator.h" +#include "../shared/data_gen_device.h" +#include "../shared/data_gen_host.h" +#include "../shared/device_properties.h" +#include "../shared/printbuffer.h" +#include "../shared/ptrdiff.h" + +enum fft_status +{ + fft_status_success, + fft_status_failure, + fft_status_invalid_arg_value, + fft_status_invalid_dimensions, + fft_status_invalid_array_type, + fft_status_invalid_strides, + fft_status_invalid_distance, + fft_status_invalid_offset, + fft_status_invalid_work_buffer, +}; + +enum fft_transform_type +{ + fft_transform_type_complex_forward, + fft_transform_type_complex_inverse, + fft_transform_type_real_forward, + fft_transform_type_real_inverse, +}; + +enum fft_precision +{ + fft_precision_half, + fft_precision_single, + fft_precision_double, +}; + +static std::istream& operator>>(std::istream& str, fft_precision& precision) +{ + std::string word; + str >> word; + + if(word == "half") + precision = fft_precision_half; + else if(word == "single") + precision = fft_precision_single; + else if(word == "double") + precision = fft_precision_double; + else + throw std::runtime_error("Invalid precision specified"); + return str; +} + +// fft_input_generator: linearly spaced sequence in [-0.5,0.5] +// fft_input_random_generator: pseudo-random sequence in [-0.5,0.5] +enum fft_input_generator +{ + fft_input_random_generator_device, + fft_input_random_generator_host, + fft_input_generator_device, + fft_input_generator_host, +}; + +static std::istream& operator>>(std::istream& str, fft_input_generator& gen) +{ + std::string word; + str >> word; + + if(word == "0") + gen = fft_input_random_generator_device; + else if(word == "1") + gen = fft_input_random_generator_host; + else if(word == "2") + gen = fft_input_generator_device; + else if(word == "3") + gen = fft_input_generator_host; + else + throw std::runtime_error("Invalid input generator specified"); + return str; +} + +enum fft_array_type +{ + fft_array_type_complex_interleaved, + fft_array_type_complex_planar, + fft_array_type_real, + fft_array_type_hermitian_interleaved, + fft_array_type_hermitian_planar, + fft_array_type_unset, +}; + +enum fft_result_placement +{ + fft_placement_inplace, + fft_placement_notinplace, +}; + +// Determine the size of the data type given the precision and type. +template +inline Tsize var_size(const fft_precision precision, const fft_array_type type) +{ + size_t var_size = 0; + switch(precision) + { + case fft_precision_half: + var_size = sizeof(_Float16); + break; + case fft_precision_single: + var_size = sizeof(float); + break; + case fft_precision_double: + var_size = sizeof(double); + break; + } + switch(type) + { + case fft_array_type_complex_interleaved: + case fft_array_type_hermitian_interleaved: + var_size *= 2; + break; + default: + break; + } + return var_size; +} +// Given an array type and transform length, strides, etc, load random floats in [0,1] +// into the input array of floats/doubles or complex floats/doubles gpu buffers. 
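// A minimal sketch of how the dimension-unrolling overload below might be driven; the
// buffer type, explicit template arguments, and surrounding setup are assumptions for
// illustration rather than part of this header:
//   std::vector<gpubuf> input(1);   // one interleaved device buffer, already allocated
//   set_input<gpubuf, float>(input, fft_input_random_generator_device,
//                            fft_array_type_complex_interleaved,
//                            length, ilength, istride, idist, nbatch, deviceProp);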
+template +inline void set_input(std::vector& input, + const fft_input_generator igen, + const fft_array_type itype, + const std::vector& length, + const std::vector& ilength, + const std::vector& istride, + const Tint1& whole_length, + const Tint1& whole_stride, + const size_t idist, + const size_t nbatch, + const hipDeviceProp_t& deviceProp) +{ + auto isize = count_iters(whole_length) * nbatch; + + switch(itype) + { + case fft_array_type_complex_interleaved: + case fft_array_type_hermitian_interleaved: + { + auto ibuffer = (rocfft_complex*)input[0].data(); + + if(igen == fft_input_generator_device) + generate_interleaved_data( + whole_length, idist, isize, whole_stride, nbatch, ibuffer, deviceProp); + else if(igen == fft_input_random_generator_device) + generate_random_interleaved_data( + whole_length, idist, isize, whole_stride, ibuffer, deviceProp); + + if(itype == fft_array_type_hermitian_interleaved) + { + auto ibuffer_2 = (rocfft_complex*)input[0].data(); + impose_hermitian_symmetry_interleaved( + length, ilength, istride, idist, nbatch, ibuffer_2, deviceProp); + } + + break; + } + case fft_array_type_complex_planar: + case fft_array_type_hermitian_planar: + { + auto ibuffer_real = (Tfloat*)input[0].data(); + auto ibuffer_imag = (Tfloat*)input[1].data(); + + if(igen == fft_input_generator_device) + generate_planar_data(whole_length, + idist, + isize, + whole_stride, + nbatch, + ibuffer_real, + ibuffer_imag, + deviceProp); + else if(igen == fft_input_random_generator_device) + generate_random_planar_data( + whole_length, idist, isize, whole_stride, ibuffer_real, ibuffer_imag, deviceProp); + + if(itype == fft_array_type_hermitian_planar) + impose_hermitian_symmetry_planar( + length, ilength, istride, idist, nbatch, ibuffer_real, ibuffer_imag, deviceProp); + + break; + } + case fft_array_type_real: + { + auto ibuffer = (Tfloat*)input[0].data(); + + if(igen == fft_input_generator_device) + generate_real_data( + whole_length, idist, isize, whole_stride, nbatch, ibuffer, deviceProp); + else if(igen == fft_input_random_generator_device) + generate_random_real_data( + whole_length, idist, isize, whole_stride, ibuffer, deviceProp); + + break; + } + default: + throw std::runtime_error("Input layout format not yet supported"); + } +} + +template +inline void set_input(std::vector& input, + const fft_input_generator igen, + const fft_array_type itype, + const std::vector& length, + const std::vector& ilength, + const std::vector& istride, + const Tint1& whole_length, + const Tint1& whole_stride, + const size_t idist, + const size_t nbatch, + const hipDeviceProp_t& deviceProp) +{ + switch(itype) + { + case fft_array_type_complex_interleaved: + case fft_array_type_hermitian_interleaved: + { + if(igen == fft_input_generator_host) + generate_interleaved_data(input, whole_length, whole_stride, idist, nbatch); + else if(igen == fft_input_random_generator_host) + generate_random_interleaved_data( + input, whole_length, whole_stride, idist, nbatch); + + if(itype == fft_array_type_hermitian_interleaved) + impose_hermitian_symmetry_interleaved(input, length, istride, idist, nbatch); + + break; + } + case fft_array_type_complex_planar: + case fft_array_type_hermitian_planar: + { + if(igen == fft_input_generator_host) + generate_planar_data(input, whole_length, whole_stride, idist, nbatch); + else if(igen == fft_input_random_generator_host) + generate_random_planar_data(input, whole_length, whole_stride, idist, nbatch); + + if(itype == fft_array_type_hermitian_planar) + 
impose_hermitian_symmetry_planar(input, length, istride, idist, nbatch); + + break; + } + case fft_array_type_real: + { + if(igen == fft_input_generator_host) + generate_real_data(input, whole_length, whole_stride, idist, nbatch); + else if(igen == fft_input_random_generator_host) + generate_random_real_data(input, whole_length, whole_stride, idist, nbatch); + + break; + } + default: + throw std::runtime_error("Input layout format not yet supported"); + } +} + +// unroll set_input for dimension 1, 2, 3 +template +inline void set_input(std::vector& input, + const fft_input_generator igen, + const fft_array_type itype, + const std::vector& length, + const std::vector& ilength, + const std::vector& istride, + const size_t idist, + const size_t nbatch, + const hipDeviceProp_t& deviceProp) +{ + switch(length.size()) + { + case 1: + set_input(input, + igen, + itype, + length, + ilength, + istride, + ilength[0], + istride[0], + idist, + nbatch, + deviceProp); + break; + case 2: + set_input(input, + igen, + itype, + length, + ilength, + istride, + std::make_tuple(ilength[0], ilength[1]), + std::make_tuple(istride[0], istride[1]), + idist, + nbatch, + deviceProp); + break; + case 3: + set_input(input, + igen, + itype, + length, + ilength, + istride, + std::make_tuple(ilength[0], ilength[1], ilength[2]), + std::make_tuple(istride[0], istride[1], istride[2]), + idist, + nbatch, + deviceProp); + break; + default: + abort(); + } +} + +// Container class for test parameters. +class fft_params +{ +public: + // All parameters are row-major. + std::vector length; + std::vector istride; + std::vector ostride; + size_t nbatch = 1; + fft_precision precision = fft_precision_single; + fft_input_generator igen = fft_input_random_generator_device; + fft_transform_type transform_type = fft_transform_type_complex_forward; + fft_result_placement placement = fft_placement_inplace; + size_t idist = 0; + size_t odist = 0; + fft_array_type itype = fft_array_type_unset; + fft_array_type otype = fft_array_type_unset; + std::vector ioffset = {0, 0}; + std::vector ooffset = {0, 0}; + + std::vector isize; + std::vector osize; + + size_t workbuffersize = 0; + + struct fft_brick + { + // all vectors here are row-major, with same length as FFT + // dimension + 1 (for batch dimension) + + // inclusive lower bound of brick + std::vector lower; + // exclusive upper bound of brick + std::vector upper; + // stride of brick in memory + std::vector stride; + + // compute the length of this brick + std::vector length() const + { + std::vector ret; + for(size_t i = 0; i < lower.size(); ++i) + ret.push_back(upper[i] - lower[i]); + return ret; + } + + // compute offset of lower bound in a field with the given + // stride + dist (batch stride is separate) + size_t lower_field_offset(std::vector stride, size_t dist) const + { + // brick strides include batch, so adjust our input accordingly + stride.insert(stride.begin(), dist); + + return std::inner_product(lower.begin(), lower.end(), stride.begin(), 0); + } + + // location of the brick + int device = 0; + }; + + struct fft_field + { + std::vector bricks; + }; + // optional brick decomposition of inputs/outputs + std::vector ifields; + std::vector ofields; + + // run testing load/store callbacks + bool run_callbacks = false; + static constexpr double load_cb_scalar = 0.457813941; + static constexpr double store_cb_scalar = 0.391504938; + + // Check that data outside of output strides is not overwritten. 
+ // This is only set explicitly on some tests where there's space + // between dimensions, but the dimensions are still in-order. + // We're not trying to generically find holes in arbitrary data + // layouts. + // + // NOTE: this flag is not included in tokens, since it doesn't + // affect how the FFT library behaves. + bool check_output_strides = false; + + // scaling factor - we do a pointwise multiplication of outputs by + // this factor + double scale_factor = 1.0; + + fft_params(){}; + virtual ~fft_params(){}; + + // Given an array type, return the name as a string. + static std::string array_type_name(const fft_array_type type, bool verbose = true) + { + switch(type) + { + case fft_array_type_complex_interleaved: + return verbose ? "fft_array_type_complex_interleaved" : "CI"; + case fft_array_type_complex_planar: + return verbose ? "fft_array_type_complex_planar" : "CP"; + case fft_array_type_real: + return verbose ? "fft_array_type_real" : "R"; + case fft_array_type_hermitian_interleaved: + return verbose ? "fft_array_type_hermitian_interleaved" : "HI"; + case fft_array_type_hermitian_planar: + return verbose ? "fft_array_type_hermitian_planar" : "HP"; + case fft_array_type_unset: + return verbose ? "fft_array_type_unset" : "UN"; + } + return ""; + } + + std::string transform_type_name() const + { + switch(transform_type) + { + case fft_transform_type_complex_forward: + return "fft_transform_type_complex_forward"; + case fft_transform_type_complex_inverse: + return "fft_transform_type_complex_inverse"; + case fft_transform_type_real_forward: + return "fft_transform_type_real_forward"; + case fft_transform_type_real_inverse: + return "fft_transform_type_real_inverse"; + default: + throw std::runtime_error("Invalid transform type"); + } + } + + // Convert to string for output. + std::string str(const std::string& separator = ", ") const + { + // top-level stride/dist are not used when fields are specified. 
+ const bool have_ifields = !ifields.empty(); + const bool have_ofields = !ofields.empty(); + + std::stringstream ss; + auto print_size_vec = [&](const char* description, const std::vector& vec) { + ss << description << ":"; + for(auto i : vec) + ss << " " << i; + ss << separator; + }; + auto print_fields = [&](const char* description, const std::vector& fields) { + for(unsigned int fidx = 0; fidx < fields.size(); ++fidx) + { + const auto& f = fields[fidx]; + ss << description << " " << fidx << ":" << separator; + for(unsigned int bidx = 0; bidx < f.bricks.size(); ++bidx) + { + const auto& b = f.bricks[bidx]; + ss << " brick " << bidx << ":" << separator; + print_size_vec(" lower", b.lower); + print_size_vec(" upper", b.upper); + print_size_vec(" stride", b.stride); + ss << " device: " << b.device << separator; + } + } + }; + + print_size_vec("length", length); + if(have_ifields) + { + print_fields("ifield", ifields); + } + else + { + print_size_vec("istride", istride); + ss << "idist: " << idist << separator; + } + + if(have_ofields) + { + print_fields("ofield", ofields); + } + else + { + print_size_vec("ostride", ostride); + ss << "odist: " << odist << separator; + } + + ss << "batch: " << nbatch << separator; + print_size_vec("isize", isize); + print_size_vec("osize", osize); + + print_size_vec("ioffset", ioffset); + print_size_vec("ooffset", ooffset); + + if(placement == fft_placement_inplace) + ss << "in-place"; + else + ss << "out-of-place"; + ss << separator; + ss << "transform_type: " << transform_type_name() << separator; + ss << array_type_name(itype) << " -> " << array_type_name(otype) << separator; + switch(precision) + { + case fft_precision_half: + ss << "half-precision"; + break; + case fft_precision_single: + ss << "single-precision"; + break; + case fft_precision_double: + ss << "double-precision"; + break; + } + ss << separator; + + print_size_vec("ilength", ilength()); + print_size_vec("olength", olength()); + + print_size_vec("ibuffer_size", ibuffer_sizes()); + print_size_vec("obuffer_size", obuffer_sizes()); + + if(scale_factor != 1.0) + ss << "scale factor: " << scale_factor << separator; + + return ss.str(); + } + + // Produce a stringified token of the test fft params. 
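// For instance (illustrative values): a single-precision, out-of-place, contiguous
// 4x4 complex forward transform with batch 1 produces the token
//   complex_forward_len_4_4_single_op_batch_1_istride_4_1_CI_ostride_4_1_CI_idist_16_odist_16_ioffset_0_0_ooffset_0_0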
+ std::string token() const + { + std::string ret; + + switch(transform_type) + { + case fft_transform_type_complex_forward: + ret += "complex_forward_"; + break; + case fft_transform_type_complex_inverse: + ret += "complex_inverse_"; + break; + case fft_transform_type_real_forward: + ret += "real_forward_"; + break; + case fft_transform_type_real_inverse: + ret += "real_inverse_"; + break; + } + + auto append_size_vec = [&ret](const std::vector& vec) { + for(auto s : vec) + { + ret += "_"; + ret += std::to_string(s); + } + }; + + ret += "len"; + append_size_vec(length); + + switch(precision) + { + case fft_precision_half: + ret += "_half_"; + break; + case fft_precision_single: + ret += "_single_"; + break; + case fft_precision_double: + ret += "_double_"; + break; + } + + switch(placement) + { + case fft_placement_inplace: + ret += "ip_"; + break; + case fft_placement_notinplace: + ret += "op_"; + break; + } + + ret += "batch_"; + ret += std::to_string(nbatch); + + auto append_array_type = [&ret](fft_array_type type) { + switch(type) + { + case fft_array_type_complex_interleaved: + ret += "CI"; + break; + case fft_array_type_complex_planar: + ret += "CP"; + break; + case fft_array_type_real: + ret += "R"; + break; + case fft_array_type_hermitian_interleaved: + ret += "HI"; + break; + case fft_array_type_hermitian_planar: + ret += "HP"; + break; + default: + ret += "UN"; + break; + } + }; + + auto append_brick_info = [&ret, &append_size_vec](const fft_brick& b) { + ret += "_brick"; + + ret += "_lower"; + append_size_vec(b.lower); + ret += "_upper"; + append_size_vec(b.upper); + ret += "_stride"; + append_size_vec(b.stride); + ret += "_dev_"; + ret += std::to_string(b.device); + }; + + const bool have_ifields = !ifields.empty(); + const bool have_ofields = !ofields.empty(); + + if(have_ifields) + { + for(const auto& f : ifields) + { + ret += "_ifield"; + for(const auto& b : f.bricks) + append_brick_info(b); + } + } + else + { + ret += "_istride"; + append_size_vec(istride); + ret += "_"; + append_array_type(itype); + } + + if(have_ofields) + { + for(const auto& f : ofields) + { + ret += "_ofield"; + for(const auto& b : f.bricks) + append_brick_info(b); + } + } + else + { + ret += "_ostride"; + append_size_vec(ostride); + ret += "_"; + append_array_type(otype); + } + + if(!have_ifields) + { + ret += "_idist_"; + ret += std::to_string(idist); + } + if(!have_ofields) + { + ret += "_odist_"; + ret += std::to_string(odist); + } + + if(!have_ifields) + { + ret += "_ioffset"; + append_size_vec(ioffset); + } + + if(!have_ofields) + { + ret += "_ooffset"; + append_size_vec(ooffset); + } + + if(run_callbacks) + ret += "_CB"; + + if(scale_factor != 1.0) + ret += "_scale"; + + return ret; + } + + // Set all params from a stringified token. 
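// from_token() parses the fields written by token() above; e.g. (illustrative)
//   fft_params p;
//   p.from_token(q.token());
// recovers the transform type, lengths, precision, placement, batch, strides,
// distances, offsets, and any brick fields encoded by another fft_params object q.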
+ void from_token(std::string token) + { + std::vector vals; + + std::string delimiter = "_"; + { + size_t pos = 0; + while((pos = token.find(delimiter)) != std::string::npos) + { + auto val = token.substr(0, pos); + vals.push_back(val); + token.erase(0, pos + delimiter.length()); + } + vals.push_back(token); + } + + auto size_parser + = [](const std::vector& vals, const std::string token, size_t& pos) { + if(vals[pos++] != token) + throw std::runtime_error("Unable to parse token"); + return std::stoull(vals[pos++]); + }; + + auto vector_parser + = [](const std::vector& vals, const std::string token, size_t& pos) { + if(vals[pos++] != token) + throw std::runtime_error("Unable to parse token"); + std::vector vec; + + while(pos < vals.size()) + { + if(std::all_of(vals[pos].begin(), vals[pos].end(), ::isdigit)) + { + vec.push_back(std::stoull(vals[pos++])); + } + else + { + break; + } + } + return vec; + }; + + auto type_parser = [](const std::string& val) { + if(val == "CI") + return fft_array_type_complex_interleaved; + else if(val == "CP") + return fft_array_type_complex_planar; + else if(val == "R") + return fft_array_type_real; + else if(val == "HI") + return fft_array_type_hermitian_interleaved; + else if(val == "HP") + return fft_array_type_hermitian_planar; + return fft_array_type_unset; + }; + + auto field_parser = [&vector_parser, &size_parser](const std::vector& vals, + size_t& pos, + std::vector& output) { + // skip over ifield/ofield word + pos++; + fft_field& f = output.emplace_back(); + while(pos < vals.size() && vals[pos] == "brick") + { + fft_brick& b = f.bricks.emplace_back(); + pos++; + b.lower = vector_parser(vals, "lower", pos); + b.upper = vector_parser(vals, "upper", pos); + b.stride = vector_parser(vals, "stride", pos); + b.device = size_parser(vals, "dev", pos); + } + }; + + size_t pos = 0; + + bool complex = vals[pos++] == "complex"; + bool forward = vals[pos++] == "forward"; + + if(complex && forward) + transform_type = fft_transform_type_complex_forward; + if(complex && !forward) + transform_type = fft_transform_type_complex_inverse; + if(!complex && forward) + transform_type = fft_transform_type_real_forward; + if(!complex && !forward) + transform_type = fft_transform_type_real_inverse; + + length = vector_parser(vals, "len", pos); + + if(vals[pos] == "half") + precision = fft_precision_half; + else if(vals[pos] == "single") + precision = fft_precision_single; + else if(vals[pos] == "double") + precision = fft_precision_double; + pos++; + + placement = (vals[pos++] == "ip") ? 
fft_placement_inplace : fft_placement_notinplace; + + nbatch = size_parser(vals, "batch", pos); + + // strides, bricks etc are mixed in from here, so just keep + // looking at the next token to decide what to do + while(pos < vals.size()) + { + const auto& next_token = vals[pos]; + if(next_token == "istride") + { + istride = vector_parser(vals, "istride", pos); + itype = type_parser(vals[pos]); + pos++; + } + else if(next_token == "ostride") + { + ostride = vector_parser(vals, "ostride", pos); + otype = type_parser(vals[pos]); + pos++; + } + else if(next_token == "idist") + idist = size_parser(vals, "idist", pos); + else if(next_token == "odist") + odist = size_parser(vals, "odist", pos); + else if(next_token == "ioffset") + ioffset = vector_parser(vals, "ioffset", pos); + else if(next_token == "ooffset") + ooffset = vector_parser(vals, "ooffset", pos); + else if(next_token == "ifield") + field_parser(vals, pos, ifields); + else if(next_token == "ofield") + field_parser(vals, pos, ofields); + else + break; + } + + if(pos < vals.size() && vals[pos] == "CB") + { + run_callbacks = true; + ++pos; + } + + if(pos < vals.size() && vals[pos] == "scale") + { + // just pick some factor that's not zero or one + scale_factor = 0.1239; + ++pos; + } + } + + // Stream output operator (for gtest, etc). + friend std::ostream& operator<<(std::ostream& stream, const fft_params& params) + { + stream << params.str(); + return stream; + } + + // Dimension of the transform. + size_t dim() const + { + return length.size(); + } + + virtual std::vector ilength() const + { + auto ilength = length; + if(transform_type == fft_transform_type_real_inverse) + ilength[dim() - 1] = ilength[dim() - 1] / 2 + 1; + return ilength; + } + + virtual std::vector olength() const + { + auto olength = length; + if(transform_type == fft_transform_type_real_forward) + olength[dim() - 1] = olength[dim() - 1] / 2 + 1; + return olength; + } + + static size_t nbuffer(const fft_array_type type) + { + switch(type) + { + case fft_array_type_real: + case fft_array_type_complex_interleaved: + case fft_array_type_hermitian_interleaved: + return 1; + case fft_array_type_complex_planar: + case fft_array_type_hermitian_planar: + return 2; + case fft_array_type_unset: + return 0; + } + return 0; + } + + // Number of input buffers + size_t nibuffer() const + { + return nbuffer(itype); + } + + // Number of output buffers + size_t nobuffer() const + { + return nbuffer(otype); + } + + void set_iotypes() + { + if(itype == fft_array_type_unset) + { + switch(transform_type) + { + case fft_transform_type_complex_forward: + case fft_transform_type_complex_inverse: + itype = fft_array_type_complex_interleaved; + break; + case fft_transform_type_real_forward: + itype = fft_array_type_real; + break; + case fft_transform_type_real_inverse: + itype = fft_array_type_hermitian_interleaved; + break; + default: + throw std::runtime_error("Invalid transform type"); + } + } + if(otype == fft_array_type_unset) + { + switch(transform_type) + { + case fft_transform_type_complex_forward: + case fft_transform_type_complex_inverse: + otype = fft_array_type_complex_interleaved; + break; + case fft_transform_type_real_forward: + otype = fft_array_type_hermitian_interleaved; + break; + case fft_transform_type_real_inverse: + otype = fft_array_type_real; + break; + default: + throw std::runtime_error("Invalid transform type"); + } + } + } + + // Check that the input and output types are consistent. 
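// e.g. a real forward transform pairing fft_array_type_real input with
// fft_array_type_hermitian_interleaved (or _planar) output is accepted, while
// real input with fft_array_type_complex_interleaved output is rejected.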
+ bool check_iotypes() const + { + switch(itype) + { + case fft_array_type_complex_interleaved: + case fft_array_type_complex_planar: + case fft_array_type_hermitian_interleaved: + case fft_array_type_hermitian_planar: + case fft_array_type_real: + break; + default: + throw std::runtime_error("Invalid Input array type format"); + } + + switch(otype) + { + case fft_array_type_complex_interleaved: + case fft_array_type_complex_planar: + case fft_array_type_hermitian_interleaved: + case fft_array_type_hermitian_planar: + case fft_array_type_real: + break; + default: + throw std::runtime_error("Invalid Input array type format"); + } + + // Check that format choices are supported + if(transform_type != fft_transform_type_real_forward + && transform_type != fft_transform_type_real_inverse) + { + if(placement == fft_placement_inplace && itype != otype) + { + throw std::runtime_error( + "In-place transforms must have identical input and output types"); + } + } + + bool okformat = true; + switch(itype) + { + case fft_array_type_complex_interleaved: + case fft_array_type_complex_planar: + okformat = (otype == fft_array_type_complex_interleaved + || otype == fft_array_type_complex_planar); + break; + case fft_array_type_hermitian_interleaved: + case fft_array_type_hermitian_planar: + okformat = otype == fft_array_type_real; + break; + case fft_array_type_real: + okformat = (otype == fft_array_type_hermitian_interleaved + || otype == fft_array_type_hermitian_planar); + break; + default: + throw std::runtime_error("Invalid Input array type format"); + } + + return okformat; + } + + // Given a length vector, set the rest of the strides. + // The optional argument stride0 sets the stride for the contiguous dimension. + // The optional rcpadding argument sets the stride correctly for in-place + // multi-dimensional real/complex transforms. + // Format is row-major. + template + std::vector compute_stride(const std::vector& length, + const std::vector& stride0 = std::vector(), + const bool rcpadding = false) const + { + std::vector stride(dim()); + + size_t dimoffset = 0; + + if(stride0.size() == 0) + { + // Set the contiguous stride: + stride[dim() - 1] = 1; + dimoffset = 1; + } + else + { + // Copy the input values to the end of the stride array: + for(size_t i = 0; i < stride0.size(); ++i) + { + stride[dim() - stride0.size() + i] = stride0[i]; + } + } + + if(stride0.size() < dim()) + { + // Compute any remaining values via recursion. 
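+        // Illustrative example: for row-major lengths {8, 4, 2} with no
+        // user-supplied strides, the loop below produces stride = {8, 2, 1}:
+        // the fastest dimension gets stride 1 and each slower stride is the
+        // product of the faster lengths.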
+ for(size_t i = dim() - dimoffset - stride0.size(); i-- > 0;) + { + auto lengthip1 = length[i + 1]; + if(rcpadding && i == dim() - 2) + { + lengthip1 = 2 * (lengthip1 / 2 + 1); + } + stride[i] = stride[i + 1] * lengthip1; + } + } + + return stride; + } + + void compute_istride() + { + istride = compute_stride(ilength(), + istride, + placement == fft_placement_inplace + && transform_type == fft_transform_type_real_forward); + } + + void compute_ostride() + { + ostride = compute_stride(olength(), + ostride, + placement == fft_placement_inplace + && transform_type == fft_transform_type_real_inverse); + } + + virtual void compute_isize() + { + auto il = ilength(); + size_t val = compute_ptrdiff(il, istride, nbatch, idist); + isize.resize(nibuffer()); + for(unsigned int i = 0; i < isize.size(); ++i) + { + isize[i] = val + ioffset[i]; + } + } + + virtual void compute_osize() + { + auto ol = olength(); + size_t val = compute_ptrdiff(ol, ostride, nbatch, odist); + osize.resize(nobuffer()); + for(unsigned int i = 0; i < osize.size(); ++i) + { + osize[i] = val + ooffset[i]; + } + } + + std::vector ibuffer_sizes() const + { + std::vector ibuffer_sizes; + + // In-place real-to-complex transforms need to have enough space in the input buffer to + // accomadate the output, which is slightly larger. + if(placement == fft_placement_inplace && transform_type == fft_transform_type_real_forward) + { + return obuffer_sizes(); + } + + if(isize.empty()) + return ibuffer_sizes; + + switch(itype) + { + case fft_array_type_complex_planar: + case fft_array_type_hermitian_planar: + ibuffer_sizes.resize(2); + break; + default: + ibuffer_sizes.resize(1); + } + for(unsigned i = 0; i < ibuffer_sizes.size(); i++) + { + ibuffer_sizes[i] = isize[i] * var_size(precision, itype); + } + return ibuffer_sizes; + } + + virtual std::vector obuffer_sizes() const + { + std::vector obuffer_sizes; + + if(osize.empty()) + return obuffer_sizes; + + switch(otype) + { + case fft_array_type_complex_planar: + case fft_array_type_hermitian_planar: + obuffer_sizes.resize(2); + break; + default: + obuffer_sizes.resize(1); + } + for(unsigned i = 0; i < obuffer_sizes.size(); i++) + { + obuffer_sizes[i] = osize[i] * var_size(precision, otype); + } + return obuffer_sizes; + } + + // Compute the idist for a given transform based on the placeness, transform type, and data + // layout. + size_t compute_idist() const + { + size_t dist = 0; + // In-place 1D transforms need extra dist. + if(transform_type == fft_transform_type_real_forward && dim() == 1 + && placement == fft_placement_inplace) + { + dist = 2 * (length[0] / 2 + 1) * istride[0]; + return dist; + } + + if(transform_type == fft_transform_type_real_inverse && dim() == 1) + { + dist = (length[0] / 2 + 1) * istride[0]; + return dist; + } + + dist = (transform_type == fft_transform_type_real_inverse) + ? (length[dim() - 1] / 2 + 1) * istride[dim() - 1] + : length[dim() - 1] * istride[dim() - 1]; + for(unsigned int i = 0; i < dim() - 1; ++i) + { + dist = std::max(length[i] * istride[i], dist); + } + return dist; + } + void set_idist() + { + if(idist != 0) + return; + idist = compute_idist(); + } + + // Compute the odist for a given transform based on the placeness, transform type, and data + // layout. Row-major. + size_t compute_odist() const + { + size_t dist = 0; + // In-place 1D transforms need extra dist. 
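+        // Illustrative example: a length-8 in-place 1D real inverse transform
+        // with unit ostride gets odist = 2 * (8 / 2 + 1) = 10 real elements,
+        // enough to hold the (8 / 2 + 1) complex input values that share the
+        // same buffer.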
+ if(transform_type == fft_transform_type_real_inverse && dim() == 1 + && placement == fft_placement_inplace) + { + dist = 2 * (length[0] / 2 + 1) * ostride[0]; + return dist; + } + + if(transform_type == fft_transform_type_real_forward && dim() == 1) + { + dist = (length[0] / 2 + 1) * ostride[0]; + return dist; + } + + dist = (transform_type == fft_transform_type_real_forward) + ? (length[dim() - 1] / 2 + 1) * ostride[dim() - 1] + : length[dim() - 1] * ostride[dim() - 1]; + for(unsigned int i = 0; i < dim() - 1; ++i) + { + dist = std::max(length[i] * ostride[i], dist); + } + return dist; + } + void set_odist() + { + if(odist != 0) + return; + odist = compute_odist(); + } + + // Put the length, stride, batch, and dist into a single length/stride array and pass off to the + // validity checker. + bool valid_length_stride_batch_dist(const std::vector& l0, + const std::vector& s0, + const size_t n, + const size_t dist, + const int verbose = 0) const + { + if(l0.size() != s0.size()) + return false; + + // Length and stride vectors, including bathes: + std::vector l{}, s{}; + for(unsigned int i = 0; i < l0.size(); ++i) + { + if(l0[i] > 1) + { + if(s0[i] == 0) + return false; + l.push_back(l0[i]); + s.push_back(s0[i]); + } + } + if(n > 1) + { + if(dist == 0) + return false; + l.push_back(n); + s.push_back(dist); + } + + return array_valid(l, s, verbose); + } + + // Return true if the given GPU parameters would produce a valid transform. + bool valid(const int verbose) const + { + if(ioffset.size() < nibuffer() || ooffset.size() < nobuffer()) + return false; + + // Check that in-place transforms have the same input and output stride: + if(placement == fft_placement_inplace) + { + const auto stridesize = std::min(istride.size(), ostride.size()); + bool samestride = true; + for(unsigned int i = 0; i < stridesize; ++i) + { + if(istride[i] != ostride[i]) + samestride = false; + } + if((transform_type == fft_transform_type_complex_forward + || transform_type == fft_transform_type_complex_inverse) + && !samestride) + { + // In-place transforms require identical input and output strides. + if(verbose) + { + std::cout << "istride:"; + for(const auto& i : istride) + std::cout << " " << i; + std::cout << " ostride0:"; + for(const auto& i : ostride) + std::cout << " " << i; + std::cout << " differ; skipped for in-place transforms: skipping test" + << std::endl; + } + return false; + } + + if((transform_type == fft_transform_type_complex_forward + || transform_type == fft_transform_type_complex_inverse) + && (idist != odist) && nbatch > 1) + { + // In-place transforms require identical distance, if + // batch > 1. If batch is 1 then dist is ignored and + // the FFT should still work. + if(verbose) + { + std::cout << "idist:" << idist << " odist:" << odist + << " differ; skipped for in-place transforms: skipping test" + << std::endl; + } + return false; + } + + if((transform_type == fft_transform_type_real_forward + || transform_type == fft_transform_type_real_inverse) + && (istride.back() != 1 || ostride.back() != 1)) + { + // In-place real/complex transforms require unit strides. 
+ if(verbose) + { + std::cout + << "istride.back(): " << istride.back() + << " ostride.back(): " << ostride.back() + << " must be unitary for in-place real/complex transforms: skipping test" + << std::endl; + } + return false; + } + + if((itype == fft_array_type_complex_interleaved + && otype == fft_array_type_complex_planar) + || (itype == fft_array_type_complex_planar + && otype == fft_array_type_complex_interleaved)) + { + if(verbose) + { + std::cout << "In-place c2c transforms require identical io types; skipped.\n"; + } + return false; + } + + // Check offsets + switch(transform_type) + { + case fft_transform_type_complex_forward: + case fft_transform_type_complex_inverse: + for(unsigned int i = 0; i < nibuffer(); ++i) + { + if(ioffset[i] != ooffset[i]) + return false; + } + break; + case fft_transform_type_real_forward: + if(ioffset[0] != 2 * ooffset[0]) + return false; + break; + case fft_transform_type_real_inverse: + if(2 * ioffset[0] != ooffset[0]) + return false; + break; + } + } + + if(!check_iotypes()) + return false; + + // we can only check output strides on out-of-place + // transforms, since we need to initialize output to a known + // pattern + if(placement == fft_placement_inplace && check_output_strides) + return false; + + // Check input and output strides + if(valid_length_stride_batch_dist(ilength(), istride, nbatch, idist, verbose) != true) + { + if(verbose) + std::cout << "Invalid input data format.\n"; + return false; + } + if(!(ilength() == olength() && istride == ostride && idist == odist)) + { + // Only check if different + if(valid_length_stride_batch_dist(olength(), ostride, nbatch, odist, verbose) != true) + { + if(verbose) + std::cout << "Invalid output data format.\n"; + return false; + } + } + + // The parameters are valid. + return true; + } + + // Fill in any missing parameters. + void validate() + { + set_iotypes(); + compute_istride(); + compute_ostride(); + set_idist(); + set_odist(); + compute_isize(); + compute_osize(); + + validate_fields(); + } + + virtual void validate_fields() const + { + if(!ifields.empty() || !ofields.empty()) + throw std::runtime_error("input/output fields are unsupported"); + } + + // Column-major getters: + std::vector length_cm() const + { + auto length_cm = length; + std::reverse(std::begin(length_cm), std::end(length_cm)); + return length_cm; + } + std::vector ilength_cm() const + { + auto ilength_cm = ilength(); + std::reverse(std::begin(ilength_cm), std::end(ilength_cm)); + return ilength_cm; + } + std::vector olength_cm() const + { + auto olength_cm = olength(); + std::reverse(std::begin(olength_cm), std::end(olength_cm)); + return olength_cm; + } + std::vector istride_cm() const + { + auto istride_cm = istride; + std::reverse(std::begin(istride_cm), std::end(istride_cm)); + return istride_cm; + } + std::vector ostride_cm() const + { + auto ostride_cm = ostride; + std::reverse(std::begin(ostride_cm), std::end(ostride_cm)); + return ostride_cm; + } + bool is_planar() const + { + if(itype == fft_array_type_complex_planar || itype == fft_array_type_hermitian_planar) + return true; + if(otype == fft_array_type_complex_planar || otype == fft_array_type_hermitian_planar) + return true; + return false; + } + + // Given a data type and dimensions, fill the buffer, imposing Hermitian symmetry if necessary. 
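+    // (Hermitian symmetry is required when generating input for complex-to-real
+    // transforms, so that the generated spectrum corresponds to a purely real
+    // signal.)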
+ template + inline void compute_input(std::vector& input) + { + auto deviceProp = get_curr_device_prop(); + + switch(precision) + { + case fft_precision_half: + set_input( + input, igen, itype, length, ilength(), istride, idist, nbatch, deviceProp); + break; + case fft_precision_double: + set_input( + input, igen, itype, length, ilength(), istride, idist, nbatch, deviceProp); + break; + case fft_precision_single: + set_input( + input, igen, itype, length, ilength(), istride, idist, nbatch, deviceProp); + break; + } + } + + template + void print_ibuffer(const std::vector& buf, Tstream& stream = std::cout) const + { + switch(itype) + { + case fft_array_type_complex_interleaved: + case fft_array_type_hermitian_interleaved: + { + switch(precision) + { + case fft_precision_half: + { + buffer_printer> s; + s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset); + break; + } + case fft_precision_single: + { + buffer_printer> s; + s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset); + break; + } + case fft_precision_double: + { + buffer_printer> s; + s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset); + break; + } + } + break; + } + case fft_array_type_complex_planar: + case fft_array_type_hermitian_planar: + case fft_array_type_real: + { + switch(precision) + { + case fft_precision_half: + { + buffer_printer<_Float16> s; + s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset); + break; + } + case fft_precision_single: + { + buffer_printer s; + s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset); + break; + } + case fft_precision_double: + { + buffer_printer s; + s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset); + break; + } + } + break; + } + default: + throw std::runtime_error("Invalid itype in print_ibuffer"); + } + } + + template + void print_obuffer(const std::vector& buf, Tstream& stream = std::cout) const + { + switch(otype) + { + case fft_array_type_complex_interleaved: + case fft_array_type_hermitian_interleaved: + { + switch(precision) + { + case fft_precision_half: + { + buffer_printer> s; + s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset); + break; + } + case fft_precision_single: + { + buffer_printer> s; + s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset); + break; + } + case fft_precision_double: + buffer_printer> s; + s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset); + break; + } + break; + } + case fft_array_type_complex_planar: + case fft_array_type_hermitian_planar: + case fft_array_type_real: + { + switch(precision) + { + case fft_precision_half: + { + buffer_printer<_Float16> s; + s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset); + break; + } + case fft_precision_single: + { + buffer_printer s; + s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset); + break; + } + case fft_precision_double: + { + buffer_printer s; + s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset); + break; + } + } + break; + } + + default: + throw std::runtime_error("Invalid itype in print_obuffer"); + } + } + + void print_ibuffer_flat(const std::vector& buf) const + { + switch(itype) + { + case fft_array_type_complex_interleaved: + case fft_array_type_hermitian_interleaved: + { + switch(precision) + { + case fft_precision_half: + { + buffer_printer> s; + s.print_buffer_flat(buf, osize, ooffset); + break; + } + case fft_precision_single: + { + buffer_printer> s; + s.print_buffer_flat(buf, osize, ooffset); + break; + } + case fft_precision_double: + 
buffer_printer> s; + s.print_buffer_flat(buf, osize, ooffset); + break; + } + break; + } + case fft_array_type_complex_planar: + case fft_array_type_hermitian_planar: + case fft_array_type_real: + { + switch(precision) + { + case fft_precision_half: + { + buffer_printer<_Float16> s; + s.print_buffer_flat(buf, osize, ooffset); + break; + } + case fft_precision_single: + { + buffer_printer s; + s.print_buffer_flat(buf, osize, ooffset); + break; + } + case fft_precision_double: + { + buffer_printer s; + s.print_buffer_flat(buf, osize, ooffset); + break; + } + } + break; + default: + throw std::runtime_error("Invalid itype in print_ibuffer_flat"); + } + } + } + + void print_obuffer_flat(const std::vector& buf) const + { + switch(otype) + { + case fft_array_type_complex_interleaved: + case fft_array_type_hermitian_interleaved: + { + switch(precision) + { + case fft_precision_half: + { + buffer_printer> s; + s.print_buffer_flat(buf, osize, ooffset); + break; + } + case fft_precision_single: + { + buffer_printer> s; + s.print_buffer_flat(buf, osize, ooffset); + break; + } + case fft_precision_double: + buffer_printer> s; + s.print_buffer_flat(buf, osize, ooffset); + break; + } + break; + } + case fft_array_type_complex_planar: + case fft_array_type_hermitian_planar: + case fft_array_type_real: + { + switch(precision) + { + case fft_precision_half: + { + buffer_printer<_Float16> s; + s.print_buffer_flat(buf, osize, ooffset); + break; + } + case fft_precision_single: + { + buffer_printer s; + s.print_buffer_flat(buf, osize, ooffset); + break; + } + + case fft_precision_double: + { + buffer_printer s; + s.print_buffer_flat(buf, osize, ooffset); + break; + } + } + break; + default: + throw std::runtime_error("Invalid itype in print_ibuffer_flat"); + } + } + } + + virtual fft_status set_callbacks(void* load_cb_host, + void* load_cb_data, + void* store_cb_host, + void* store_cb_data) + { + return fft_status_success; + } + + virtual fft_status execute(void** in, void** out) + { + return fft_status_success; + }; + + size_t fft_params_vram_footprint() + { + return fft_params::vram_footprint(); + } + + virtual size_t vram_footprint() + { + const auto ibuf_size = ibuffer_sizes(); + size_t val = std::accumulate(ibuf_size.begin(), ibuf_size.end(), (size_t)1); + if(placement == fft_placement_notinplace) + { + const auto obuf_size = obuffer_sizes(); + val += std::accumulate(obuf_size.begin(), obuf_size.end(), (size_t)1); + } + return val; + } + + // Specific exception type for work buffer allocation failure. + // Tests that hit this can't fit on the GPU and should be skipped. 
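+    // Illustrative use (not part of this header): a GoogleTest-based harness
+    // might wrap plan creation as
+    //   try { params.create_plan(); }
+    //   catch(fft_params::work_buffer_alloc_failure&) { GTEST_SKIP(); }
+    // so that oversized cases are reported as skipped rather than failed.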
+ struct work_buffer_alloc_failure : public std::runtime_error + { + work_buffer_alloc_failure(const std::string& s) + : std::runtime_error(s) + { + } + }; + + virtual fft_status create_plan() + { + return fft_status_success; + } + + // Change a forward transform to it's inverse + void inverse_from_forward(fft_params& params_forward) + { + switch(params_forward.transform_type) + { + case fft_transform_type_complex_forward: + transform_type = fft_transform_type_complex_inverse; + break; + case fft_transform_type_real_forward: + transform_type = fft_transform_type_real_inverse; + break; + default: + throw std::runtime_error("Transform type not forward."); + } + + length = params_forward.length; + istride = params_forward.ostride; + ostride = params_forward.istride; + nbatch = params_forward.nbatch; + precision = params_forward.precision; + placement = params_forward.placement; + idist = params_forward.odist; + odist = params_forward.idist; + itype = params_forward.otype; + otype = params_forward.itype; + ioffset = params_forward.ooffset; + ooffset = params_forward.ioffset; + + run_callbacks = params_forward.run_callbacks; + + check_output_strides = params_forward.check_output_strides; + + scale_factor = 1 / params_forward.scale_factor; + } + + // prepare for multi-GPU transform. Generated input is in ibuffer. + // pibuffer, pobuffer are the pointers that will be passed to the + // FFT library's "execute" API. + virtual void multi_gpu_prepare(std::vector& ibuffer, + std::vector& pibuffer, + std::vector& pobuffer) + { + } + + // finalize multi-GPU transform. pobuffers are the pointers + // provided to the FFT library's "execute" API. obuffer is the + // buffer where transform output needs to go for validation + virtual void multi_gpu_finalize(std::vector& obuffer, std::vector& pobuffer) {} + + // create bricks in the specified field for the specified number + // of devices. The field is split along the highest FFT + // dimension, and the length only includes FFT lengths, not batch + // dimension. 
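+    // Illustrative example: with field_length = {8, 4} and deviceCount = 2, the
+    // slowest FFT dimension is split so device 0 owns rows [0, 4) and device 1
+    // owns rows [4, 8); each brick covers the full second dimension and the full
+    // batch, with contiguous row-major strides inside the brick.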
+ void distribute_field(int deviceCount, + std::vector& fields, + const std::vector& field_length) + { + size_t slowLen = field_length.front(); + if(slowLen < static_cast(deviceCount)) + throw std::runtime_error("too many devices to distribute length " + + std::to_string(slowLen)); + + auto& field = fields.emplace_back(); + + for(int i = 0; i < deviceCount; ++i) + { + // start at origin + std::vector field_lower(field_length.size()); + std::vector field_upper(field_length.size()); + + // note: slowest FFT dim is index 0 in these coordinates + field_lower[0] = slowLen / deviceCount * i; + + // last brick needs to include the whole slow len + if(i == deviceCount - 1) + { + field_upper[0] = slowLen; + } + else + { + field_upper[0] = std::min(slowLen, field_lower[0] + slowLen / deviceCount); + } + + for(unsigned int upperDim = 1; upperDim < field_length.size(); ++upperDim) + { + field_upper[upperDim] = field_length[upperDim]; + } + + // field coordinates also need to include batch + field_lower.insert(field_lower.begin(), 0); + field_upper.insert(field_upper.begin(), nbatch); + + // bricks have contiguous strides + size_t brick_dist = 1; + std::vector brick_stride(field_lower.size()); + for(size_t distIdx = 0; distIdx < field_lower.size(); ++distIdx) + { + // fill strides from fastest to slowest + *(brick_stride.rbegin() + distIdx) = brick_dist; + brick_dist *= *(field_upper.rbegin() + distIdx) - *(field_lower.rbegin() + distIdx); + } + field.bricks.push_back( + fft_params::fft_brick{field_lower, field_upper, brick_stride, i}); + } + } + + void distribute_input(int deviceCount) + { + distribute_field(deviceCount, ifields, length); + } + + void distribute_output(int deviceCount) + { + distribute_field(deviceCount, ofields, olength()); + } +}; + +// This is used with the program_options class so that the user can type an integer on the +// command line and we store into an enum varaible +template +std::basic_istream<_Elem, _Traits>& operator>>(std::basic_istream<_Elem, _Traits>& stream, + fft_array_type& atype) +{ + unsigned tmp; + stream >> tmp; + atype = fft_array_type(tmp); + return stream; +} + +// similarly for transform type +template +std::basic_istream<_Elem, _Traits>& operator>>(std::basic_istream<_Elem, _Traits>& stream, + fft_transform_type& ttype) +{ + unsigned tmp; + stream >> tmp; + ttype = fft_transform_type(tmp); + return stream; +} + +// Returns pairs of startindex, endindex, for 1D, 2D, 3D lengths +template +std::vector> partition_colmajor(const T1& length) +{ + return partition_base(length, compute_partition_count(length)); +} + +// Partition on the rightmost part of the tuple, for col-major indexing +template +std::vector, std::tuple>> + partition_colmajor(const std::tuple& length) +{ + auto partitions = partition_base(std::get<1>(length), compute_partition_count(length)); + std::vector, std::tuple>> ret(partitions.size()); + for(size_t i = 0; i < partitions.size(); ++i) + { + std::get<1>(ret[i].first) = partitions[i].first; + std::get<0>(ret[i].first) = 0; + std::get<1>(ret[i].second) = partitions[i].second; + std::get<0>(ret[i].second) = std::get<0>(length); + } + return ret; +} +template +std::vector, std::tuple>> + partition_colmajor(const std::tuple& length) +{ + auto partitions = partition_base(std::get<2>(length), compute_partition_count(length)); + std::vector, std::tuple>> ret(partitions.size()); + for(size_t i = 0; i < partitions.size(); ++i) + { + std::get<2>(ret[i].first) = partitions[i].first; + std::get<1>(ret[i].first) = 0; + std::get<0>(ret[i].first) = 0; + 
std::get<2>(ret[i].second) = partitions[i].second; + std::get<1>(ret[i].second) = std::get<1>(length); + std::get<0>(ret[i].second) = std::get<0>(length); + } + return ret; +} + +// Copy data of dimensions length with strides istride and length idist between batches to +// a buffer with strides ostride and length odist between batches. The input and output +// types are identical. +template +inline void copy_buffers_1to1(const Tval* input, + Tval* output, + const Tint1& whole_length, + const size_t nbatch, + const Tint2& istride, + const size_t idist, + const Tint3& ostride, + const size_t odist, + const std::vector& ioffset, + const std::vector& ooffset) +{ + const bool idx_equals_odx = istride == ostride && idist == odist; + size_t idx_base = 0; + size_t odx_base = 0; + auto partitions = partition_rowmajor(whole_length); + for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist) + { +#ifdef _OPENMP +#pragma omp parallel for num_threads(partitions.size()) +#endif + for(size_t part = 0; part < partitions.size(); ++part) + { + auto index = partitions[part].first; + const auto length = partitions[part].second; + do + { + const auto idx = compute_index(index, istride, idx_base); + const auto odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base); + output[odx + ooffset[0]] = input[idx + ioffset[0]]; + } while(increment_rowmajor(index, length)); + } + } +} + +// Copy data of dimensions length with strides istride and length idist between batches to +// a buffer with strides ostride and length odist between batches. The input type is +// planar and the output type is complex interleaved. +template +inline void copy_buffers_2to1(const Tval* input0, + const Tval* input1, + rocfft_complex* output, + const Tint1& whole_length, + const size_t nbatch, + const Tint2& istride, + const size_t idist, + const Tint3& ostride, + const size_t odist, + const std::vector& ioffset, + const std::vector& ooffset) +{ + const bool idx_equals_odx = istride == ostride && idist == odist; + size_t idx_base = 0; + size_t odx_base = 0; + auto partitions = partition_rowmajor(whole_length); + for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist) + { +#ifdef _OPENMP +#pragma omp parallel for num_threads(partitions.size()) +#endif + for(size_t part = 0; part < partitions.size(); ++part) + { + auto index = partitions[part].first; + const auto length = partitions[part].second; + do + { + const auto idx = compute_index(index, istride, idx_base); + const auto odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base); + output[odx + ooffset[0]] + = rocfft_complex(input0[idx + ioffset[0]], input1[idx + ioffset[1]]); + } while(increment_rowmajor(index, length)); + } + } +} + +// Copy data of dimensions length with strides istride and length idist between batches to +// a buffer with strides ostride and length odist between batches. The input type is +// complex interleaved and the output type is planar. 
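+// (Here "planar" means separate real and imaginary arrays: output0 receives the
+// real part and output1 the imaginary part of each interleaved input element.)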
+template +inline void copy_buffers_1to2(const rocfft_complex* input, + Tval* output0, + Tval* output1, + const Tint1& whole_length, + const size_t nbatch, + const Tint2& istride, + const size_t idist, + const Tint3& ostride, + const size_t odist, + const std::vector& ioffset, + const std::vector& ooffset) +{ + const bool idx_equals_odx = istride == ostride && idist == odist; + size_t idx_base = 0; + size_t odx_base = 0; + auto partitions = partition_rowmajor(whole_length); + for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist) + { +#ifdef _OPENMP +#pragma omp parallel for num_threads(partitions.size()) +#endif + for(size_t part = 0; part < partitions.size(); ++part) + { + auto index = partitions[part].first; + const auto length = partitions[part].second; + do + { + const auto idx = compute_index(index, istride, idx_base); + const auto odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base); + output0[odx + ooffset[0]] = input[idx + ioffset[0]].real(); + output1[odx + ooffset[1]] = input[idx + ioffset[0]].imag(); + } while(increment_rowmajor(index, length)); + } + } +} + +// Copy data of dimensions length with strides istride and length idist between batches to +// a buffer with strides ostride and length odist between batches. The input type given +// by itype, and the output type is given by otype. +template +inline void copy_buffers(const std::vector& input, + std::vector& output, + const Tint1& length, + const size_t nbatch, + const fft_precision precision, + const fft_array_type itype, + const Tint2& istride, + const size_t idist, + const fft_array_type otype, + const Tint3& ostride, + const size_t odist, + const std::vector& ioffset, + const std::vector& ooffset) +{ + if(itype == otype) + { + switch(itype) + { + case fft_array_type_complex_interleaved: + case fft_array_type_hermitian_interleaved: + switch(precision) + { + case fft_precision_half: + copy_buffers_1to1( + reinterpret_cast*>(input[0].data()), + reinterpret_cast*>(output[0].data()), + length, + nbatch, + istride, + idist, + ostride, + odist, + ioffset, + ooffset); + break; + case fft_precision_single: + copy_buffers_1to1(reinterpret_cast*>(input[0].data()), + reinterpret_cast*>(output[0].data()), + length, + nbatch, + istride, + idist, + ostride, + odist, + ioffset, + ooffset); + break; + case fft_precision_double: + copy_buffers_1to1(reinterpret_cast*>(input[0].data()), + reinterpret_cast*>(output[0].data()), + length, + nbatch, + istride, + idist, + ostride, + odist, + ioffset, + ooffset); + break; + } + break; + case fft_array_type_real: + case fft_array_type_complex_planar: + case fft_array_type_hermitian_planar: + for(unsigned int idx = 0; idx < input.size(); ++idx) + { + switch(precision) + { + case fft_precision_half: + copy_buffers_1to1(reinterpret_cast(input[idx].data()), + reinterpret_cast<_Float16*>(output[idx].data()), + length, + nbatch, + istride, + idist, + ostride, + odist, + ioffset, + ooffset); + break; + case fft_precision_single: + copy_buffers_1to1(reinterpret_cast(input[idx].data()), + reinterpret_cast(output[idx].data()), + length, + nbatch, + istride, + idist, + ostride, + odist, + ioffset, + ooffset); + break; + case fft_precision_double: + copy_buffers_1to1(reinterpret_cast(input[idx].data()), + reinterpret_cast(output[idx].data()), + length, + nbatch, + istride, + idist, + ostride, + odist, + ioffset, + ooffset); + break; + } + } + break; + default: + throw std::runtime_error("Invalid data type"); + } + } + else if((itype == fft_array_type_complex_interleaved && 
otype == fft_array_type_complex_planar) + || (itype == fft_array_type_hermitian_interleaved + && otype == fft_array_type_hermitian_planar)) + { + // copy 1to2 + switch(precision) + { + case fft_precision_half: + copy_buffers_1to2(reinterpret_cast*>(input[0].data()), + reinterpret_cast<_Float16*>(output[0].data()), + reinterpret_cast<_Float16*>(output[1].data()), + length, + nbatch, + istride, + idist, + ostride, + odist, + ioffset, + ooffset); + break; + case fft_precision_single: + copy_buffers_1to2(reinterpret_cast*>(input[0].data()), + reinterpret_cast(output[0].data()), + reinterpret_cast(output[1].data()), + length, + nbatch, + istride, + idist, + ostride, + odist, + ioffset, + ooffset); + break; + case fft_precision_double: + copy_buffers_1to2(reinterpret_cast*>(input[0].data()), + reinterpret_cast(output[0].data()), + reinterpret_cast(output[1].data()), + length, + nbatch, + istride, + idist, + ostride, + odist, + ioffset, + ooffset); + break; + } + } + else if((itype == fft_array_type_complex_planar && otype == fft_array_type_complex_interleaved) + || (itype == fft_array_type_hermitian_planar + && otype == fft_array_type_hermitian_interleaved)) + { + // copy 2 to 1 + switch(precision) + { + case fft_precision_half: + copy_buffers_2to1(reinterpret_cast(input[0].data()), + reinterpret_cast(input[1].data()), + reinterpret_cast*>(output[0].data()), + length, + nbatch, + istride, + idist, + ostride, + odist, + ioffset, + ooffset); + break; + case fft_precision_single: + copy_buffers_2to1(reinterpret_cast(input[0].data()), + reinterpret_cast(input[1].data()), + reinterpret_cast*>(output[0].data()), + length, + nbatch, + istride, + idist, + ostride, + odist, + ioffset, + ooffset); + break; + case fft_precision_double: + copy_buffers_2to1(reinterpret_cast(input[0].data()), + reinterpret_cast(input[1].data()), + reinterpret_cast*>(output[0].data()), + length, + nbatch, + istride, + idist, + ostride, + odist, + ioffset, + ooffset); + break; + } + } + else + { + throw std::runtime_error("Invalid input and output types."); + } +} + +// unroll arbitrary-dimension copy_buffers into specializations for 1-, 2-, 3-dimensions +template +inline void copy_buffers(const std::vector& input, + std::vector& output, + const std::vector& length, + const size_t nbatch, + const fft_precision precision, + const fft_array_type itype, + const std::vector& istride, + const size_t idist, + const fft_array_type otype, + const std::vector& ostride, + const size_t odist, + const std::vector& ioffset, + const std::vector& ooffset) +{ + switch(length.size()) + { + case 1: + return copy_buffers(input, + output, + length[0], + nbatch, + precision, + itype, + istride[0], + idist, + otype, + ostride[0], + odist, + ioffset, + ooffset); + case 2: + return copy_buffers(input, + output, + std::make_tuple(length[0], length[1]), + nbatch, + precision, + itype, + std::make_tuple(istride[0], istride[1]), + idist, + otype, + std::make_tuple(ostride[0], ostride[1]), + odist, + ioffset, + ooffset); + case 3: + return copy_buffers(input, + output, + std::make_tuple(length[0], length[1], length[2]), + nbatch, + precision, + itype, + std::make_tuple(istride[0], istride[1], istride[2]), + idist, + otype, + std::make_tuple(ostride[0], ostride[1], ostride[2]), + odist, + ioffset, + ooffset); + default: + abort(); + } +} + +// Compute the L-infinity and L-2 distance between two buffers with strides istride and +// length idist between batches to a buffer with strides ostride and length odist between +// batches. 
Both buffers are of complex type. + +struct VectorNorms +{ + double l_2 = 0.0, l_inf = 0.0; +}; + +template +inline VectorNorms distance_1to1_complex(const Tcomplex* input, + const Tcomplex* output, + const Tint1& whole_length, + const size_t nbatch, + const Tint2& istride, + const size_t idist, + const Tint3& ostride, + const size_t odist, + std::vector>* linf_failures, + const double linf_cutoff, + const std::vector& ioffset, + const std::vector& ooffset, + const double output_scalar = 1.0) +{ + double linf = 0.0; + double l2 = 0.0; + + std::mutex linf_failure_lock; + std::vector> linf_failures_private; + + const bool idx_equals_odx = istride == ostride && idist == odist; + size_t idx_base = 0; + size_t odx_base = 0; + auto partitions = partition_colmajor(whole_length); + for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist) + { +#ifdef _OPENMP +#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size()) private(linf_failures_private) +#endif + for(size_t part = 0; part < partitions.size(); ++part) + { + double cur_linf = 0.0; + double cur_l2 = 0.0; + auto index = partitions[part].first; + const auto length = partitions[part].second; + + do + { + const auto idx = compute_index(index, istride, idx_base); + const auto odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base); + const double rdiff + = std::abs(static_cast(output[odx + ooffset[0]].real()) * output_scalar + - static_cast(input[idx + ioffset[0]].real())); + cur_linf = std::max(rdiff, cur_linf); + if(cur_linf > linf_cutoff) + { + std::pair fval(b, idx); + if(linf_failures) + linf_failures_private.push_back(fval); + } + cur_l2 += rdiff * rdiff; + + const double idiff + = std::abs(static_cast(output[odx + ooffset[0]].imag()) * output_scalar + - static_cast(input[idx + ioffset[0]].imag())); + cur_linf = std::max(idiff, cur_linf); + if(cur_linf > linf_cutoff) + { + std::pair fval(b, idx); + if(linf_failures) + linf_failures_private.push_back(fval); + } + cur_l2 += idiff * idiff; + + } while(increment_rowmajor(index, length)); + linf = std::max(linf, cur_linf); + l2 += cur_l2; + + if(linf_failures) + { + linf_failure_lock.lock(); + std::copy(linf_failures_private.begin(), + linf_failures_private.end(), + std::back_inserter(*linf_failures)); + linf_failure_lock.unlock(); + } + } + } + return {.l_2 = sqrt(l2), .l_inf = linf}; +} + +// Compute the L-infinity and L-2 distance between two buffers with strides istride and +// length idist between batches to a buffer with strides ostride and length odist between +// batches. Both buffers are of real type. 
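+// (As in the complex case above: l_inf is the largest element-wise absolute
+// difference, l_2 is the square root of the sum of squared differences, and
+// (batch, index) pairs are appended to linf_failures, when provided, once the
+// running maximum exceeds linf_cutoff.)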
+template +inline VectorNorms distance_1to1_real(const Tfloat* input, + const Tfloat* output, + const Tint1& whole_length, + const size_t nbatch, + const Tint2& istride, + const size_t idist, + const Tint3& ostride, + const size_t odist, + std::vector>* linf_failures, + const double linf_cutoff, + const std::vector& ioffset, + const std::vector& ooffset, + const double output_scalar = 1.0) +{ + double linf = 0.0; + double l2 = 0.0; + + std::mutex linf_failure_lock; + std::vector> linf_failures_private; + + const bool idx_equals_odx = istride == ostride && idist == odist; + size_t idx_base = 0; + size_t odx_base = 0; + auto partitions = partition_rowmajor(whole_length); + for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist) + { +#ifdef _OPENMP +#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size()) private(linf_failures_private) +#endif + for(size_t part = 0; part < partitions.size(); ++part) + { + double cur_linf = 0.0; + double cur_l2 = 0.0; + auto index = partitions[part].first; + const auto length = partitions[part].second; + do + { + const auto idx = compute_index(index, istride, idx_base); + const auto odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base); + const double diff + = std::abs(static_cast(output[odx + ooffset[0]]) * output_scalar + - static_cast(input[idx + ioffset[0]])); + cur_linf = std::max(diff, cur_linf); + if(cur_linf > linf_cutoff) + { + std::pair fval(b, idx); + if(linf_failures) + linf_failures_private.push_back(fval); + } + cur_l2 += diff * diff; + + } while(increment_rowmajor(index, length)); + linf = std::max(linf, cur_linf); + l2 += cur_l2; + + if(linf_failures) + { + linf_failure_lock.lock(); + std::copy(linf_failures_private.begin(), + linf_failures_private.end(), + std::back_inserter(*linf_failures)); + linf_failure_lock.unlock(); + } + } + } + return {.l_2 = sqrt(l2), .l_inf = linf}; +} + +// Compute the L-infinity and L-2 distance between two buffers with strides istride and +// length idist between batches to a buffer with strides ostride and length odist between +// batches. input is complex-interleaved, output is complex-planar. +template +inline VectorNorms distance_1to2(const rocfft_complex* input, + const Tval* output0, + const Tval* output1, + const Tint1& whole_length, + const size_t nbatch, + const T2& istride, + const size_t idist, + const T3& ostride, + const size_t odist, + std::vector>* linf_failures, + const double linf_cutoff, + const std::vector& ioffset, + const std::vector& ooffset, + const double output_scalar = 1.0) +{ + double linf = 0.0; + double l2 = 0.0; + + std::mutex linf_failure_lock; + std::vector> linf_failures_private; + + const bool idx_equals_odx = istride == ostride && idist == odist; + size_t idx_base = 0; + size_t odx_base = 0; + auto partitions = partition_rowmajor(whole_length); + for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist) + { +#ifdef _OPENMP +#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size()) private(linf_failures_private) +#endif + for(size_t part = 0; part < partitions.size(); ++part) + { + double cur_linf = 0.0; + double cur_l2 = 0.0; + auto index = partitions[part].first; + const auto length = partitions[part].second; + do + { + const auto idx = compute_index(index, istride, idx_base); + const auto odx = idx_equals_odx ? 
idx : compute_index(index, ostride, odx_base); + const double rdiff + = std::abs(static_cast(output0[odx + ooffset[0]]) * output_scalar + - static_cast(input[idx + ioffset[0]].real())); + cur_linf = std::max(rdiff, cur_linf); + if(cur_linf > linf_cutoff) + { + std::pair fval(b, idx); + if(linf_failures) + linf_failures_private.push_back(fval); + } + cur_l2 += rdiff * rdiff; + + const double idiff + = std::abs(static_cast(output1[odx + ooffset[1]]) * output_scalar + - static_cast(input[idx + ioffset[0]].imag())); + cur_linf = std::max(idiff, cur_linf); + if(cur_linf > linf_cutoff) + { + std::pair fval(b, idx); + if(linf_failures) + linf_failures_private.push_back(fval); + } + cur_l2 += idiff * idiff; + + } while(increment_rowmajor(index, length)); + linf = std::max(linf, cur_linf); + l2 += cur_l2; + + if(linf_failures) + { + linf_failure_lock.lock(); + std::copy(linf_failures_private.begin(), + linf_failures_private.end(), + std::back_inserter(*linf_failures)); + linf_failure_lock.unlock(); + } + } + } + return {.l_2 = sqrt(l2), .l_inf = linf}; +} + +// Compute the L-inifnity and L-2 distance between two buffers of dimension length and +// with types given by itype, otype, and precision. +template +inline VectorNorms distance(const std::vector& input, + const std::vector& output, + const Tint1& length, + const size_t nbatch, + const fft_precision precision, + const fft_array_type itype, + const Tint2& istride, + const size_t idist, + const fft_array_type otype, + const Tint3& ostride, + const size_t odist, + std::vector>* linf_failures, + const double linf_cutoff, + const std::vector& ioffset, + const std::vector& ooffset, + const double output_scalar = 1.0) +{ + VectorNorms dist; + + if(itype == otype) + { + switch(itype) + { + case fft_array_type_complex_interleaved: + case fft_array_type_hermitian_interleaved: + switch(precision) + { + case fft_precision_half: + dist = distance_1to1_complex( + reinterpret_cast*>(input[0].data()), + reinterpret_cast*>(output[0].data()), + length, + nbatch, + istride, + idist, + ostride, + odist, + linf_failures, + linf_cutoff, + ioffset, + ooffset, + output_scalar); + break; + case fft_precision_single: + dist = distance_1to1_complex( + reinterpret_cast*>(input[0].data()), + reinterpret_cast*>(output[0].data()), + length, + nbatch, + istride, + idist, + ostride, + odist, + linf_failures, + linf_cutoff, + ioffset, + ooffset, + output_scalar); + break; + case fft_precision_double: + dist = distance_1to1_complex( + reinterpret_cast*>(input[0].data()), + reinterpret_cast*>(output[0].data()), + length, + nbatch, + istride, + idist, + ostride, + odist, + linf_failures, + linf_cutoff, + ioffset, + ooffset, + output_scalar); + break; + } + dist.l_2 *= dist.l_2; + break; + case fft_array_type_real: + case fft_array_type_complex_planar: + case fft_array_type_hermitian_planar: + for(unsigned int idx = 0; idx < input.size(); ++idx) + { + VectorNorms d; + switch(precision) + { + case fft_precision_half: + d = distance_1to1_real(reinterpret_cast(input[idx].data()), + reinterpret_cast(output[idx].data()), + length, + nbatch, + istride, + idist, + ostride, + odist, + linf_failures, + linf_cutoff, + ioffset, + ooffset, + output_scalar); + break; + case fft_precision_single: + d = distance_1to1_real(reinterpret_cast(input[idx].data()), + reinterpret_cast(output[idx].data()), + length, + nbatch, + istride, + idist, + ostride, + odist, + linf_failures, + linf_cutoff, + ioffset, + ooffset, + output_scalar); + break; + case fft_precision_double: + d = 
distance_1to1_real(reinterpret_cast(input[idx].data()), + reinterpret_cast(output[idx].data()), + length, + nbatch, + istride, + idist, + ostride, + odist, + linf_failures, + linf_cutoff, + ioffset, + ooffset, + output_scalar); + break; + } + dist.l_inf = std::max(d.l_inf, dist.l_inf); + dist.l_2 += d.l_2 * d.l_2; + } + break; + default: + throw std::runtime_error("Invalid input and output types."); + } + } + else if((itype == fft_array_type_complex_interleaved && otype == fft_array_type_complex_planar) + || (itype == fft_array_type_hermitian_interleaved + && otype == fft_array_type_hermitian_planar)) + { + switch(precision) + { + case fft_precision_half: + dist = distance_1to2(reinterpret_cast*>(input[0].data()), + reinterpret_cast(output[0].data()), + reinterpret_cast(output[1].data()), + length, + nbatch, + istride, + idist, + ostride, + odist, + linf_failures, + linf_cutoff, + ioffset, + ooffset, + output_scalar); + break; + case fft_precision_single: + dist = distance_1to2(reinterpret_cast*>(input[0].data()), + reinterpret_cast(output[0].data()), + reinterpret_cast(output[1].data()), + length, + nbatch, + istride, + idist, + ostride, + odist, + linf_failures, + linf_cutoff, + ioffset, + ooffset, + output_scalar); + break; + case fft_precision_double: + dist = distance_1to2(reinterpret_cast*>(input[0].data()), + reinterpret_cast(output[0].data()), + reinterpret_cast(output[1].data()), + length, + nbatch, + istride, + idist, + ostride, + odist, + linf_failures, + linf_cutoff, + ioffset, + ooffset, + output_scalar); + break; + } + dist.l_2 *= dist.l_2; + } + else if((itype == fft_array_type_complex_planar && otype == fft_array_type_complex_interleaved) + || (itype == fft_array_type_hermitian_planar + && otype == fft_array_type_hermitian_interleaved)) + { + switch(precision) + { + case fft_precision_half: + dist + = distance_1to2(reinterpret_cast*>(output[0].data()), + reinterpret_cast(input[0].data()), + reinterpret_cast(input[1].data()), + length, + nbatch, + ostride, + odist, + istride, + idist, + linf_failures, + linf_cutoff, + ioffset, + ooffset, + output_scalar); + break; + case fft_precision_single: + dist = distance_1to2(reinterpret_cast*>(output[0].data()), + reinterpret_cast(input[0].data()), + reinterpret_cast(input[1].data()), + length, + nbatch, + ostride, + odist, + istride, + idist, + linf_failures, + linf_cutoff, + ioffset, + ooffset, + output_scalar); + break; + case fft_precision_double: + dist = distance_1to2(reinterpret_cast*>(output[0].data()), + reinterpret_cast(input[0].data()), + reinterpret_cast(input[1].data()), + length, + nbatch, + ostride, + odist, + istride, + idist, + linf_failures, + linf_cutoff, + ioffset, + ooffset, + output_scalar); + break; + } + dist.l_2 *= dist.l_2; + } + else + { + throw std::runtime_error("Invalid input and output types."); + } + dist.l_2 = sqrt(dist.l_2); + return dist; +} + +// check if the specified length + stride/dist is contiguous +template +bool is_contiguous_rowmajor(const std::vector& length, + const std::vector& stride, + size_t dist) +{ + size_t expected_stride = 1; + auto stride_it = stride.rbegin(); + auto length_it = length.rbegin(); + for(; stride_it != stride.rend() && length_it != length.rend(); ++stride_it, ++length_it) + { + if(*stride_it != expected_stride) + return false; + expected_stride *= *length_it; + } + return expected_stride == dist; +} + +// Unroll arbitrary-dimension distance into specializations for 1-, 2-, 3-dimensions +template +inline VectorNorms distance(const std::vector& input, + const 
std::vector& output, + std::vector length, + size_t nbatch, + const fft_precision precision, + const fft_array_type itype, + std::vector istride, + const size_t idist, + const fft_array_type otype, + std::vector ostride, + const size_t odist, + std::vector>* linf_failures, + const double linf_cutoff, + const std::vector& ioffset, + const std::vector& ooffset, + const double output_scalar = 1.0) +{ + // If istride and ostride are both contiguous, collapse them down + // to one dimension. Index calculation is simpler (and faster) + // in the 1D case. + if(is_contiguous_rowmajor(length, istride, idist) + && is_contiguous_rowmajor(length, ostride, odist)) + { + length = {product(length.begin(), length.end()) * nbatch}; + istride = {static_cast(1)}; + ostride = {static_cast(1)}; + nbatch = 1; + } + + switch(length.size()) + { + case 1: + return distance(input, + output, + length[0], + nbatch, + precision, + itype, + istride[0], + idist, + otype, + ostride[0], + odist, + linf_failures, + linf_cutoff, + ioffset, + ooffset, + output_scalar); + case 2: + return distance(input, + output, + std::make_tuple(length[0], length[1]), + nbatch, + precision, + itype, + std::make_tuple(istride[0], istride[1]), + idist, + otype, + std::make_tuple(ostride[0], ostride[1]), + odist, + linf_failures, + linf_cutoff, + ioffset, + ooffset, + output_scalar); + case 3: + return distance(input, + output, + std::make_tuple(length[0], length[1], length[2]), + nbatch, + precision, + itype, + std::make_tuple(istride[0], istride[1], istride[2]), + idist, + otype, + std::make_tuple(ostride[0], ostride[1], ostride[2]), + odist, + linf_failures, + linf_cutoff, + ioffset, + ooffset, + output_scalar); + default: + abort(); + } +} + +// Compute the L-infinity and L-2 norm of a buffer with strides istride and +// length idist. Data is rocfft_complex. +template +inline VectorNorms norm_complex(const Tcomplex* input, + const T1& whole_length, + const size_t nbatch, + const T2& istride, + const size_t idist, + const std::vector& offset) +{ + double linf = 0.0; + double l2 = 0.0; + + size_t idx_base = 0; + auto partitions = partition_rowmajor(whole_length); + for(size_t b = 0; b < nbatch; b++, idx_base += idist) + { +#ifdef _OPENMP +#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size()) +#endif + for(size_t part = 0; part < partitions.size(); ++part) + { + double cur_linf = 0.0; + double cur_l2 = 0.0; + auto index = partitions[part].first; + const auto length = partitions[part].second; + do + { + const auto idx = compute_index(index, istride, idx_base); + + const double rval = std::abs(static_cast(input[idx + offset[0]].real())); + cur_linf = std::max(rval, cur_linf); + cur_l2 += rval * rval; + + const double ival = std::abs(static_cast(input[idx + offset[0]].imag())); + cur_linf = std::max(ival, cur_linf); + cur_l2 += ival * ival; + + } while(increment_rowmajor(index, length)); + linf = std::max(linf, cur_linf); + l2 += cur_l2; + } + } + return {.l_2 = sqrt(l2), .l_inf = linf}; +} + +// Compute the L-infinity and L-2 norm of abuffer with strides istride and +// length idist. Data is real-valued. 
+template +inline VectorNorms norm_real(const Tfloat* input, + const T1& whole_length, + const size_t nbatch, + const T2& istride, + const size_t idist, + const std::vector& offset) +{ + double linf = 0.0; + double l2 = 0.0; + + size_t idx_base = 0; + auto partitions = partition_rowmajor(whole_length); + for(size_t b = 0; b < nbatch; b++, idx_base += idist) + { +#ifdef _OPENMP +#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size()) +#endif + for(size_t part = 0; part < partitions.size(); ++part) + { + double cur_linf = 0.0; + double cur_l2 = 0.0; + auto index = partitions[part].first; + const auto length = partitions[part].second; + do + { + const auto idx = compute_index(index, istride, idx_base); + const double val = std::abs(static_cast(input[idx + offset[0]])); + cur_linf = std::max(val, cur_linf); + cur_l2 += val * val; + + } while(increment_rowmajor(index, length)); + linf = std::max(linf, cur_linf); + l2 += cur_l2; + } + } + return {.l_2 = sqrt(l2), .l_inf = linf}; +} + +// Compute the L-infinity and L-2 norm of abuffer with strides istride and +// length idist. Data format is given by precision and itype. +template +inline VectorNorms norm(const std::vector& input, + const T1& length, + const size_t nbatch, + const fft_precision precision, + const fft_array_type itype, + const T2& istride, + const size_t idist, + const std::vector& offset) +{ + VectorNorms norm; + + switch(itype) + { + case fft_array_type_complex_interleaved: + case fft_array_type_hermitian_interleaved: + switch(precision) + { + case fft_precision_half: + norm = norm_complex(reinterpret_cast*>(input[0].data()), + length, + nbatch, + istride, + idist, + offset); + break; + case fft_precision_single: + norm = norm_complex(reinterpret_cast*>(input[0].data()), + length, + nbatch, + istride, + idist, + offset); + break; + case fft_precision_double: + norm = norm_complex(reinterpret_cast*>(input[0].data()), + length, + nbatch, + istride, + idist, + offset); + break; + } + norm.l_2 *= norm.l_2; + break; + case fft_array_type_real: + case fft_array_type_complex_planar: + case fft_array_type_hermitian_planar: + for(unsigned int idx = 0; idx < input.size(); ++idx) + { + VectorNorms n; + switch(precision) + { + case fft_precision_half: + n = norm_real(reinterpret_cast(input[idx].data()), + length, + nbatch, + istride, + idist, + offset); + break; + case fft_precision_single: + n = norm_real(reinterpret_cast(input[idx].data()), + length, + nbatch, + istride, + idist, + offset); + break; + case fft_precision_double: + n = norm_real(reinterpret_cast(input[idx].data()), + length, + nbatch, + istride, + idist, + offset); + break; + } + norm.l_inf = std::max(n.l_inf, norm.l_inf); + norm.l_2 += n.l_2 * n.l_2; + } + break; + default: + throw std::runtime_error("Invalid data type"); + } + + norm.l_2 = sqrt(norm.l_2); + return norm; +} + +// Unroll arbitrary-dimension norm into specializations for 1-, 2-, 3-dimensions +template +inline VectorNorms norm(const std::vector& input, + std::vector length, + size_t nbatch, + const fft_precision precision, + const fft_array_type type, + std::vector stride, + const size_t dist, + const std::vector& offset) +{ + // If stride is contiguous, collapse it down to one dimension. + // Index calculation is simpler (and faster) in the 1D case. 
+ if(is_contiguous_rowmajor(length, stride, dist)) + { + length = {product(length.begin(), length.end()) * nbatch}; + stride = {static_cast(1)}; + nbatch = 1; + } + + switch(length.size()) + { + case 1: + return norm(input, length[0], nbatch, precision, type, stride[0], dist, offset); + case 2: + return norm(input, + std::make_tuple(length[0], length[1]), + nbatch, + precision, + type, + std::make_tuple(stride[0], stride[1]), + dist, + offset); + case 3: + return norm(input, + std::make_tuple(length[0], length[1], length[2]), + nbatch, + precision, + type, + std::make_tuple(stride[0], stride[1], stride[2]), + dist, + offset); + default: + abort(); + } +} + +// Given a data type and precision, the distance between batches, and +// the batch size, allocate the required host buffer(s). +static std::vector allocate_host_buffer(const fft_precision precision, + const fft_array_type type, + const std::vector& size) +{ + std::vector buffers(size.size()); + for(unsigned int i = 0; i < size.size(); ++i) + { + buffers[i].alloc(size[i] * var_size(precision, type)); + } + return buffers; +} + +// Check if the required buffers fit in the device vram. +inline bool vram_fits_problem(const size_t prob_size, const size_t vram_avail, int deviceId = 0) +{ + // We keep a small margin of error for fitting the problem into vram: + const size_t extra = 1 << 27; + + return vram_avail > prob_size + extra; +} + +// Computes the twiddle table VRAM footprint for r2c/c2r transforms. +// This function will return 0 for the other transform types, since +// the VRAM footprint in rocFFT is negligible for the other cases. +inline size_t twiddle_table_vram_footprint(const fft_params& params) +{ + size_t vram_footprint = 0; + + // Add vram footprint from real/complex even twiddle buffer size. + if(params.transform_type == fft_transform_type_real_forward + || params.transform_type == fft_transform_type_real_inverse) + { + const auto realdim = params.length.back(); + if(realdim % 2 == 0) + { + const auto complex_size = params.precision == fft_precision_single ? 8 : 16; + // even length twiddle size is 1/4 of the real size, but + // in complex elements + vram_footprint += realdim * complex_size / 4; + } + } + + return vram_footprint; +} + +#endif diff --git a/shared/fftw_transform.h b/shared/fftw_transform.h new file mode 100644 index 0000000..873a373 --- /dev/null +++ b/shared/fftw_transform.h @@ -0,0 +1,493 @@ +// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#pragma once +#ifndef FFTWTRANSFORM_H +#define FFTWTRANSFORM_H + +#include "hostbuf.h" +#include "rocfft_complex.h" +#include "test_params.h" +#include +#include + +// Function to return maximum error for float and double types. +// +// Following Schatzman (1996; Accuracy of the Discrete Fourier +// Transform and the Fast Fourier Transform), the shape of relative +// l_2 error vs length should look like +// +// epsilon * sqrt(log2(length)). +// +// The magic epsilon constants below were chosen so that we get a +// reasonable upper bound for (all of) our tests. +// +// For rocFFT, prime lengths result in the highest error. As such, +// the epsilons below are perhaps too loose for pow2 lengths; but they +// are appropriate for prime lengths. +template +inline double type_epsilon(); +template <> +inline double type_epsilon<_Float16>() +{ + return half_epsilon; +} +template <> +inline double type_epsilon() +{ + return single_epsilon; +} +template <> +inline double type_epsilon() +{ + return double_epsilon; +} + +// C++ traits to translate float->fftwf_complex and +// double->fftw_complex. +// The correct FFTW complex type can be accessed via, for example, +// using complex_t = typename fftw_complex_trait::complex_t; +template +struct fftw_trait; +template <> +struct fftw_trait<_Float16> +{ + // fftw does not support half precision, so use single precision and convert + using fftw_complex_type = fftwf_complex; + using fftw_plan_type = fftwf_plan; +}; +template <> +struct fftw_trait +{ + using fftw_complex_type = fftwf_complex; + using fftw_plan_type = fftwf_plan; +}; +template <> +struct fftw_trait +{ + using fftw_complex_type = fftw_complex; + using fftw_plan_type = fftw_plan; +}; + +// Copies the half-precision input buffer to a single-precision +// buffer. Note that the input buffer is already sized like it's a +// single-precision buffer (but only half of it is filled), because +// we allocate a single-precision buffer for FFTW to plan with. 
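+// For example (illustrative sizes): 8 half-precision values arrive in a
+// buffer of 8 * sizeof(float) = 32 bytes, of which only the first 16 bytes
+// contain _Float16 data, so the copy below converts
+// in.size() / sizeof(_Float16) / 2 = 8 elements.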
+static hostbuf half_to_single_copy(const hostbuf& in) +{ + auto out = in.copy(); + auto in_begin = reinterpret_cast(in.data()); + std::copy_n(in_begin, in.size() / sizeof(_Float16) / 2, reinterpret_cast(out.data())); + return out; +} + +// converts a wider precision buffer to a narrower precision, in-place +template +void narrow_precision_inplace(hostbuf& in) +{ + // ensure we're actually shrinking the data + static_assert(sizeof(TfloatIn) > sizeof(TfloatOut)); + + auto readPtr = reinterpret_cast(in.data()); + auto writePtr = reinterpret_cast(in.data()); + std::copy_n(readPtr, in.size() / sizeof(TfloatIn), writePtr); + in.shrink(in.size() / (sizeof(TfloatIn) / sizeof(TfloatOut))); +} + +static void single_to_half_inplace(hostbuf& in) +{ + narrow_precision_inplace(in); +} + +// Template wrappers for real-valued FFTW allocators: +template +inline Tfloat* fftw_alloc_real_type(size_t n); +template <> +inline float* fftw_alloc_real_type(size_t n) +{ + return fftwf_alloc_real(n); +} +template <> +inline double* fftw_alloc_real_type(size_t n) +{ + return fftw_alloc_real(n); +} + +// Template wrappers for complex-valued FFTW allocators: +template +inline typename fftw_trait::fftw_complex_type* fftw_alloc_complex_type(size_t n); +template <> +inline typename fftw_trait::fftw_complex_type* fftw_alloc_complex_type(size_t n) +{ + return fftwf_alloc_complex(n); +} +template <> +inline typename fftw_trait::fftw_complex_type* fftw_alloc_complex_type(size_t n) +{ + return fftw_alloc_complex(n); +} + +template +inline fftw_type* fftw_alloc_type(size_t n); +template <> +inline float* fftw_alloc_type(size_t n) +{ + return fftw_alloc_real_type(n); +} +template <> +inline double* fftw_alloc_type(size_t n) +{ + return fftw_alloc_real_type(n); +} +template <> +inline fftwf_complex* fftw_alloc_type(size_t n) +{ + return fftw_alloc_complex_type(n); +} +template <> +inline fftw_complex* fftw_alloc_type(size_t n) +{ + return fftw_alloc_complex_type(n); +} +template <> +inline rocfft_complex* fftw_alloc_type>(size_t n) +{ + return (rocfft_complex*)fftw_alloc_complex_type(n); +} +template <> +inline rocfft_complex* fftw_alloc_type>(size_t n) +{ + return (rocfft_complex*)fftw_alloc_complex_type(n); +} + +// Template wrappers for FFTW plan executors: +template +inline void fftw_execute_type(typename fftw_trait::fftw_plan_type plan); +template <> +inline void fftw_execute_type(typename fftw_trait::fftw_plan_type plan) +{ + return fftwf_execute(plan); +} +template <> +inline void fftw_execute_type(typename fftw_trait::fftw_plan_type plan) +{ + return fftw_execute(plan); +} + +// Template wrappers for FFTW plan destroyers: +template +inline void fftw_destroy_plan_type(Tfftw_plan plan); +template <> +inline void fftw_destroy_plan_type(fftwf_plan plan) +{ + return fftwf_destroy_plan(plan); +} +template <> +inline void fftw_destroy_plan_type(fftw_plan plan) +{ + return fftw_destroy_plan(plan); +} + +// Template wrappers for FFTW c2c planners: +template +inline typename fftw_trait::fftw_plan_type + fftw_plan_guru64_dft(int rank, + const fftw_iodim64* dims, + int howmany_rank, + const fftw_iodim64* howmany_dims, + typename fftw_trait::fftw_complex_type* in, + typename fftw_trait::fftw_complex_type* out, + int sign, + unsigned flags); + +template <> +inline typename fftw_trait<_Float16>::fftw_plan_type + fftw_plan_guru64_dft<_Float16>(int rank, + const fftw_iodim64* dims, + int howmany_rank, + const fftw_iodim64* howmany_dims, + typename fftw_trait<_Float16>::fftw_complex_type* in, + typename 
fftw_trait<_Float16>::fftw_complex_type* out, + int sign, + unsigned flags) +{ + return fftwf_plan_guru64_dft(rank, dims, howmany_rank, howmany_dims, in, out, sign, flags); +} + +template <> +inline typename fftw_trait::fftw_plan_type + fftw_plan_guru64_dft(int rank, + const fftw_iodim64* dims, + int howmany_rank, + const fftw_iodim64* howmany_dims, + typename fftw_trait::fftw_complex_type* in, + typename fftw_trait::fftw_complex_type* out, + int sign, + unsigned flags) +{ + return fftwf_plan_guru64_dft(rank, dims, howmany_rank, howmany_dims, in, out, sign, flags); +} + +template <> +inline typename fftw_trait::fftw_plan_type + fftw_plan_guru64_dft(int rank, + const fftw_iodim64* dims, + int howmany_rank, + const fftw_iodim64* howmany_dims, + typename fftw_trait::fftw_complex_type* in, + typename fftw_trait::fftw_complex_type* out, + int sign, + unsigned flags) +{ + return fftw_plan_guru64_dft(rank, dims, howmany_rank, howmany_dims, in, out, sign, flags); +} + +// Template wrappers for FFTW c2c executors: +template +inline void fftw_plan_execute_c2c(typename fftw_trait::fftw_plan_type plan, + std::vector& in, + std::vector& out); + +template <> +inline void fftw_plan_execute_c2c<_Float16>(typename fftw_trait<_Float16>::fftw_plan_type plan, + std::vector& in, + std::vector& out) +{ + // since FFTW does not natively support half precision, convert + // input to single, execute, then convert output back to half + auto in_single = half_to_single_copy(in.front()); + fftwf_execute_dft(plan, + reinterpret_cast(in_single.data()), + reinterpret_cast(out.front().data())); + single_to_half_inplace(out.front()); +} + +template <> +inline void fftw_plan_execute_c2c(typename fftw_trait::fftw_plan_type plan, + std::vector& in, + std::vector& out) +{ + fftwf_execute_dft(plan, + reinterpret_cast(in.front().data()), + reinterpret_cast(out.front().data())); +} + +template <> +inline void fftw_plan_execute_c2c(typename fftw_trait::fftw_plan_type plan, + std::vector& in, + std::vector& out) +{ + fftw_execute_dft(plan, + reinterpret_cast(in.front().data()), + reinterpret_cast(out.front().data())); +} + +// Template wrappers for FFTW r2c planners: +template +inline typename fftw_trait::fftw_plan_type + fftw_plan_guru64_r2c(int rank, + const fftw_iodim64* dims, + int howmany_rank, + const fftw_iodim64* howmany_dims, + Tfloat* in, + typename fftw_trait::fftw_complex_type* out, + unsigned flags); +template <> +inline typename fftw_trait<_Float16>::fftw_plan_type + fftw_plan_guru64_r2c<_Float16>(int rank, + const fftw_iodim64* dims, + int howmany_rank, + const fftw_iodim64* howmany_dims, + _Float16* in, + typename fftw_trait<_Float16>::fftw_complex_type* out, + unsigned flags) +{ + return fftwf_plan_guru64_dft_r2c( + rank, dims, howmany_rank, howmany_dims, reinterpret_cast(in), out, flags); +} +template <> +inline typename fftw_trait::fftw_plan_type + fftw_plan_guru64_r2c(int rank, + const fftw_iodim64* dims, + int howmany_rank, + const fftw_iodim64* howmany_dims, + float* in, + typename fftw_trait::fftw_complex_type* out, + unsigned flags) +{ + return fftwf_plan_guru64_dft_r2c(rank, dims, howmany_rank, howmany_dims, in, out, flags); +} +template <> +inline typename fftw_trait::fftw_plan_type + fftw_plan_guru64_r2c(int rank, + const fftw_iodim64* dims, + int howmany_rank, + const fftw_iodim64* howmany_dims, + double* in, + typename fftw_trait::fftw_complex_type* out, + unsigned flags) +{ + return fftw_plan_guru64_dft_r2c(rank, dims, howmany_rank, howmany_dims, in, out, flags); +} + +// Template wrappers for FFTW r2c 
executors: +template +inline void fftw_plan_execute_r2c(typename fftw_trait::fftw_plan_type plan, + std::vector& in, + std::vector& out); +template <> +inline void fftw_plan_execute_r2c<_Float16>(typename fftw_trait::fftw_plan_type plan, + std::vector& in, + std::vector& out) +{ + // since FFTW does not natively support half precision, convert + // input to single, execute, then convert output back to half + auto in_single = half_to_single_copy(in.front()); + fftwf_execute_dft_r2c(plan, + reinterpret_cast(in_single.data()), + reinterpret_cast(out.front().data())); + single_to_half_inplace(out.front()); +} +template <> +inline void fftw_plan_execute_r2c(typename fftw_trait::fftw_plan_type plan, + std::vector& in, + std::vector& out) +{ + fftwf_execute_dft_r2c(plan, + reinterpret_cast(in.front().data()), + reinterpret_cast(out.front().data())); +} +template <> +inline void fftw_plan_execute_r2c(typename fftw_trait::fftw_plan_type plan, + std::vector& in, + std::vector& out) +{ + fftw_execute_dft_r2c(plan, + reinterpret_cast(in.front().data()), + reinterpret_cast(out.front().data())); +} + +// Template wrappers for FFTW c2r planners: +template +inline typename fftw_trait::fftw_plan_type + fftw_plan_guru64_c2r(int rank, + const fftw_iodim64* dims, + int howmany_rank, + const fftw_iodim64* howmany_dims, + typename fftw_trait::fftw_complex_type* in, + Tfloat* out, + unsigned flags); +template <> +inline typename fftw_trait<_Float16>::fftw_plan_type + fftw_plan_guru64_c2r<_Float16>(int rank, + const fftw_iodim64* dims, + int howmany_rank, + const fftw_iodim64* howmany_dims, + typename fftw_trait<_Float16>::fftw_complex_type* in, + _Float16* out, + unsigned flags) +{ + return fftwf_plan_guru64_dft_c2r( + rank, dims, howmany_rank, howmany_dims, in, reinterpret_cast(out), flags); +} +template <> +inline typename fftw_trait::fftw_plan_type + fftw_plan_guru64_c2r(int rank, + const fftw_iodim64* dims, + int howmany_rank, + const fftw_iodim64* howmany_dims, + typename fftw_trait::fftw_complex_type* in, + float* out, + unsigned flags) +{ + return fftwf_plan_guru64_dft_c2r(rank, dims, howmany_rank, howmany_dims, in, out, flags); +} +template <> +inline typename fftw_trait::fftw_plan_type + fftw_plan_guru64_c2r(int rank, + const fftw_iodim64* dims, + int howmany_rank, + const fftw_iodim64* howmany_dims, + typename fftw_trait::fftw_complex_type* in, + double* out, + unsigned flags) +{ + return fftw_plan_guru64_dft_c2r(rank, dims, howmany_rank, howmany_dims, in, out, flags); +} + +// Template wrappers for FFTW c2r executors: +template +inline void fftw_plan_execute_c2r(typename fftw_trait::fftw_plan_type plan, + std::vector& in, + std::vector& out); +template <> +inline void fftw_plan_execute_c2r<_Float16>(typename fftw_trait::fftw_plan_type plan, + std::vector& in, + std::vector& out) +{ + // since FFTW does not natively support half precision, convert + // input to single, execute, then convert output back to half + auto in_single = half_to_single_copy(in.front()); + fftwf_execute_dft_c2r(plan, + reinterpret_cast(in_single.data()), + reinterpret_cast(out.front().data())); + single_to_half_inplace(out.front()); +} +template <> +inline void fftw_plan_execute_c2r(typename fftw_trait::fftw_plan_type plan, + std::vector& in, + std::vector& out) +{ + fftwf_execute_dft_c2r(plan, + reinterpret_cast(in.front().data()), + reinterpret_cast(out.front().data())); +} +template <> +inline void fftw_plan_execute_c2r(typename fftw_trait::fftw_plan_type plan, + std::vector& in, + std::vector& out) +{ + 
fftw_execute_dft_c2r(plan, + reinterpret_cast(in.front().data()), + reinterpret_cast(out.front().data())); +} + +#ifdef FFTW_HAVE_SPRINT_PLAN +// Template wrappers for FFTW print plan: +template +inline char* fftw_sprint_plan(const typename fftw_trait::fftw_plan_type plan); +template <> +inline char* fftw_sprint_plan<_Float16>(const typename fftw_trait<_Float16>::fftw_plan_type plan) +{ + return fftwf_sprint_plan(plan); +} +template <> +inline char* fftw_sprint_plan(const typename fftw_trait::fftw_plan_type plan) +{ + return fftwf_sprint_plan(plan); +} +template <> +inline char* fftw_sprint_plan(const typename fftw_trait::fftw_plan_type plan) +{ + return fftw_sprint_plan(plan); +} +#endif + +#endif diff --git a/shared/gpubuf.h b/shared/gpubuf.h new file mode 100644 index 0000000..993fa95 --- /dev/null +++ b/shared/gpubuf.h @@ -0,0 +1,134 @@ +// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#ifndef ROCFFT_GPUBUF_H +#define ROCFFT_GPUBUF_H + +#include "rocfft_hip.h" +#include + +// Simple RAII class for GPU buffers. T is the type of pointer that +// data() returns +template +class gpubuf_t +{ +public: + gpubuf_t() {} + // buffers are movable but not copyable + gpubuf_t(gpubuf_t&& other) + { + std::swap(buf, other.buf); + std::swap(bsize, other.bsize); + std::swap(device, other.device); + } + gpubuf_t& operator=(gpubuf_t&& other) + { + std::swap(buf, other.buf); + std::swap(bsize, other.bsize); + std::swap(device, other.device); + return *this; + } + gpubuf_t(const gpubuf_t&) = delete; + gpubuf_t& operator=(const gpubuf_t&) = delete; + + ~gpubuf_t() + { + free(); + } + + static bool use_alloc_managed() + { + return std::getenv("ROCFFT_MALLOC_MANAGED"); + } + + hipError_t alloc(const size_t size) + { + // remember the device that was current as of alloc, so we can + // free on the correct device + auto ret = hipGetDevice(&device); + if(ret != hipSuccess) + return ret; + + bsize = size; + static bool alloc_managed = use_alloc_managed(); + free(); + ret = alloc_managed ? 
hipMallocManaged(&buf, bsize) : hipMalloc(&buf, bsize); + if(ret != hipSuccess) + { + buf = nullptr; + bsize = 0; + } + return ret; + } + + size_t size() const + { + return bsize; + } + + void free() + { + if(buf != nullptr) + { + // free on the device we allocated on + rocfft_scoped_device dev(device); + (void)hipFree(buf); + buf = nullptr; + bsize = 0; + } + } + + // return a pointer to the allocated memory, offset by the + // specified number of bytes + T* data_offset(size_t offset_bytes = 0) const + { + void* ptr = static_cast(buf) + offset_bytes; + return static_cast(ptr); + } + + T* data() const + { + return static_cast(buf); + } + + // equality/bool tests + bool operator==(std::nullptr_t n) const + { + return buf == n; + } + bool operator!=(std::nullptr_t n) const + { + return buf != n; + } + operator bool() const + { + return buf; + } + +private: + // The GPU buffer + void* buf = nullptr; + size_t bsize = 0; + int device = 0; +}; + +// default gpubuf that gives out void* pointers +typedef gpubuf_t<> gpubuf; +#endif diff --git a/shared/hip_object_wrapper.h b/shared/hip_object_wrapper.h new file mode 100644 index 0000000..54083ab --- /dev/null +++ b/shared/hip_object_wrapper.h @@ -0,0 +1,86 @@ +/****************************************************************************** +* Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +* THE SOFTWARE. 
+*******************************************************************************/ + +#ifndef ROCFFT_HIP_OBJ_WRAPPER_H +#define ROCFFT_HIP_OBJ_WRAPPER_H + +#include "rocfft_hip.h" + +// RAII wrapper around HIP objects +template +struct hip_object_wrapper_t +{ + hip_object_wrapper_t() + : obj(nullptr) + { + } + + void alloc() + { + if(obj == nullptr && TCreate(&obj) != hipSuccess) + throw std::runtime_error("hip create failure"); + } + + void free() + { + if(obj) + { + (void)TDestroy(obj); + obj = nullptr; + } + } + + operator const T&() const + { + return obj; + } + operator T&() + { + return obj; + } + + operator bool() const + { + return obj != nullptr; + } + + ~hip_object_wrapper_t() + { + free(); + } + + hip_object_wrapper_t(const hip_object_wrapper_t&) = delete; + hip_object_wrapper_t& operator=(const hip_object_wrapper_t&) = delete; + hip_object_wrapper_t(hip_object_wrapper_t&& other) + : obj(other.obj) + { + other.obj = nullptr; + } + +private: + T obj; +}; + +typedef hip_object_wrapper_t hipStream_wrapper_t; +typedef hip_object_wrapper_t hipEvent_wrapper_t; + +#endif // ROCFFT_HIP_OBJ_WRAPPER_H diff --git a/shared/hostbuf.h b/shared/hostbuf.h new file mode 100644 index 0000000..0a96c7d --- /dev/null +++ b/shared/hostbuf.h @@ -0,0 +1,158 @@ +// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#ifndef ROCFFT_HOSTBUF_H +#define ROCFFT_HOSTBUF_H + +#include "arithmetic.h" +#include +#include + +#ifndef WIN32 +#include +#include +#endif + +// Simple RAII class for host buffers. T is the type of pointer that +// data() returns +template +class hostbuf_t +{ +public: + hostbuf_t() {} + // buffers are movable but not copyable + hostbuf_t(hostbuf_t&& other) + { + std::swap(buf, other.buf); + std::swap(bsize, other.bsize); + } + hostbuf_t& operator=(hostbuf_t&& other) + { + std::swap(buf, other.buf); + std::swap(bsize, other.bsize); + return *this; + } + hostbuf_t(const hostbuf_t&) = delete; + hostbuf_t& operator=(const hostbuf_t&) = delete; + + ~hostbuf_t() + { + free(); + } + + void alloc(size_t size) + { + bsize = size; + free(); + + // we're aligning to multiples of 64 bytes, so round the + // allocation size up to the nearest 64 to keep ASAN happy + if(size % 64) + { + size += 64 - size % 64; + } + + // FFTW requires aligned allocations to use faster SIMD instructions. + // If enabling hugepages, align to 2 MiB. 
Otherwise, aligning to + // 64 bytes is enough for AVX instructions up to AVX512. +#ifdef WIN32 + buf = _aligned_malloc(size, 64); +#else + // On Linux, ask for hugepages to reduce TLB pressure and + // improve performance. Allocations need to be aligned to + // the hugepage size, and rounded up to the next whole + // hugepage. + static const size_t TWO_MiB = 2 * 1024 * 1024; + if(size >= TWO_MiB) + { + size_t rounded_size = DivRoundingUp(size, TWO_MiB) * TWO_MiB; + buf = aligned_alloc(TWO_MiB, rounded_size); + madvise(buf, rounded_size, MADV_HUGEPAGE); + } + else + buf = aligned_alloc(64, size); +#endif + } + + size_t size() const + { + return bsize; + } + + void free() + { + if(buf != nullptr) + { +#ifdef WIN32 + _aligned_free(buf); +#else + std::free(buf); +#endif + buf = nullptr; + bsize = 0; + } + } + + T* data() const + { + return static_cast(buf); + } + + // Copy method + hostbuf_t copy() const + { + hostbuf_t copy; + copy.alloc(bsize); + memcpy(copy.buf, buf, bsize); + return copy; + } + + // shrink the buffer to fit the new size + void shrink(size_t new_size) + { + if(new_size > bsize) + throw std::runtime_error("can't shrink hostbuf to larger size"); + // just pretend the buffer is now that size + bsize = new_size; + } + + // equality/bool tests + bool operator==(std::nullptr_t n) const + { + return buf == n; + } + bool operator!=(std::nullptr_t n) const + { + return buf != n; + } + operator bool() const + { + return buf; + } + +private: + // The host buffer + void* buf = nullptr; + size_t bsize = 0; +}; + +// default hostbuf that gives out void* pointers +typedef hostbuf_t<> hostbuf; +#endif diff --git a/shared/increment.h b/shared/increment.h new file mode 100644 index 0000000..90bba1d --- /dev/null +++ b/shared/increment.h @@ -0,0 +1,100 @@ +// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#ifndef ROCFFT_INCREMENT_H +#define ROCFFT_INCREMENT_H + +#include +#include +#include + +// Helper functions to iterate over a buffer in row-major order. +// Indexes may be given as either a tuple or vector of sizes. They +// return true if the index was successfully incremented to move to +// the next element in the buffer. 
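+//
+// For example (illustrative): starting from index {0, 0} with
+// length = {2, 3}, successive calls to increment_rowmajor visit
+// {0,1}, {0,2}, {1,0}, {1,1}, {1,2} and return true, then return false
+// once the index wraps back around to {0, 0}.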
+ +template +static bool increment_base(T1& index, const T2& length) +{ + static_assert(std::is_integral::value, "Integral required."); + static_assert(std::is_integral::value, "Integral required."); + if(index < length - 1) + { + ++index; + return true; + } + index = 0; + return false; +} + +// Increment the index (row-major) for looping over 1, 2, and 3 dimensions length. +template +static bool increment_rowmajor(T1& index, const T2& length) +{ + static_assert(std::is_integral::value, "Integral required."); + static_assert(std::is_integral::value, "Integral required."); + return increment_base(index, length); +} + +template +static bool increment_rowmajor(std::tuple& index, const std::tuple& length) +{ + if(increment_base(std::get<1>(index), std::get<1>(length))) + // we incremented ok, nothing further to do + return true; + // otherwise, we rolled over + return increment_base(std::get<0>(index), std::get<0>(length)); +} + +template +static bool increment_rowmajor(std::tuple& index, const std::tuple& length) +{ + if(increment_base(std::get<2>(index), std::get<2>(length))) + // we incremented ok, nothing further to do + return true; + if(increment_base(std::get<1>(index), std::get<1>(length))) + // we incremented ok, nothing further to do + return true; + // otherwise, we rolled over + return increment_base(std::get<0>(index), std::get<0>(length)); +} + +// Increment row-major index over arbitrary dimension length +template +bool increment_rowmajor(std::vector& index, const std::vector& length) +{ + for(int idim = length.size(); idim-- > 0;) + { + if(index[idim] < length[idim]) + { + if((++index[idim]) == length[idim]) + { + index[idim] = 0; + continue; + } + // we know we were able to increment something and didn't hit the end + return true; + } + } + // End the loop when we get back to the start: + return !std::all_of(index.begin(), index.end(), [](int i) { return i == 0; }); +} + +#endif diff --git a/shared/precision_type.h b/shared/precision_type.h new file mode 100644 index 0000000..526fc9a --- /dev/null +++ b/shared/precision_type.h @@ -0,0 +1,70 @@ +// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +#ifndef ROCFFT_PRECISION_TYPE_H +#define ROCFFT_PRECISION_TYPE_H + +#include "array_predicate.h" +#include "rocfft/rocfft.h" + +static size_t real_type_size(rocfft_precision precision) +{ + switch(precision) + { + case rocfft_precision_half: + return 2; + case rocfft_precision_single: + return 4; + case rocfft_precision_double: + return 8; + } +} + +static size_t complex_type_size(rocfft_precision precision) +{ + return real_type_size(precision) * 2; +} + +static const char* precision_name(rocfft_precision precision) +{ + switch(precision) + { + case rocfft_precision_half: + return "half"; + case rocfft_precision_single: + return "single"; + case rocfft_precision_double: + return "double"; + } +} + +static size_t element_size(rocfft_precision precision, rocfft_array_type array_type) +{ + return array_type_is_complex(array_type) ? complex_type_size(precision) + : real_type_size(precision); +} + +// offset a pointer by a number of elements, given the elements' +// precision and type (complex or not) +static void* ptr_offset(void* p, size_t elems, rocfft_precision precision, rocfft_array_type type) +{ + return static_cast(p) + elems * element_size(precision, type); +} +#endif diff --git a/shared/printbuffer.h b/shared/printbuffer.h new file mode 100644 index 0000000..5ae0b64 --- /dev/null +++ b/shared/printbuffer.h @@ -0,0 +1,108 @@ +// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#ifndef PRINTBUFFER_H +#define PRINTBUFFER_H + +#include "hostbuf.h" +#include "increment.h" +#include +#include + +// Output a formatted general-dimensional array with given length and stride in batches +// separated by dist. 
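+//
+// A minimal usage sketch (illustrative; `data` is assumed to point at a
+// single batch of 6 floats laid out as a row-major 2x3 array):
+//
+//   std::vector<size_t> length{2, 3}, stride{3, 1};
+//   printbuffer(data, length, stride, size_t(1), size_t(6), size_t(0), std::cout);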
+template +inline void printbuffer(const Toutput* output, + const std::vector& length, + const std::vector& stride, + const Tsize nbatch, + const Tsize dist, + const size_t offset, + Tstream& stream) +{ + auto i_base = 0; + for(unsigned int b = 0; b < nbatch; b++, i_base += dist) + { + std::vector index(length.size()); + std::fill(index.begin(), index.end(), 0); + do + { + const int i + = std::inner_product(index.begin(), index.end(), stride.begin(), i_base + offset); + stream << output[i] << " "; + for(int li = index.size(); li-- > 0;) + { + if(index[li] == (length[li] - 1)) + { + stream << "\n"; + } + else + { + break; + } + } + } while(increment_rowmajor(index, length)); + stream << std::endl; + } +} + +template +class buffer_printer +{ + // The scalar versions might be part of a planar format. +public: + template + static void print_buffer(const std::vector& buf, + const std::vector& length, + const std::vector& stride, + const Tsize nbatch, + const Tsize dist, + const std::vector& offset, + Tstream& stream = std::cout) + { + for(const auto& vec : buf) + { + printbuffer(reinterpret_cast(vec.data()), + length, + stride, + nbatch, + dist, + offset[0], + stream); + } + }; + template + static void print_buffer_flat(const std::vector& buf, + const std::vector& size, + const std::vector& offset, + Tstream& stream = std::cout) + { + for(const auto& vec : buf) + { + auto data = reinterpret_cast(vec.data()); + stream << "idx " << 0; + for(size_t i = 0; i < size[0]; ++i) + stream << " " << data[i]; + stream << std::endl; + } + }; +}; + +#endif diff --git a/shared/ptrdiff.h b/shared/ptrdiff.h new file mode 100644 index 0000000..3bd15de --- /dev/null +++ b/shared/ptrdiff.h @@ -0,0 +1,40 @@ +// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#pragma once + +// Compute the farthest point from the original pointer. +static size_t compute_ptrdiff(const std::vector& length, + const std::vector& stride, + const size_t nbatch, + const size_t dist) +{ + size_t val = 0; + if(!length.empty()) + { + val = 1; + for(unsigned int i = 0; i < length.size(); ++i) + { + val += (length[i] - 1) * stride[i]; + } + val += (nbatch - 1) * dist; + } + return val; +} diff --git a/shared/rocfft_accuracy_test.h b/shared/rocfft_accuracy_test.h new file mode 100644 index 0000000..4ce3059 --- /dev/null +++ b/shared/rocfft_accuracy_test.h @@ -0,0 +1,29 @@ +// Copyright (C) 2022 - 2022 Advanced Micro Devices, Inc. All rights reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#ifndef ROCFFT_ACCURACY_TEST +#define ROCFFT_ACCURACY_TEST + +#include "accuracy_test.h" +#include "rocfft_params.h" + +void fft_vs_reference(rocfft_params& params, bool round_trip = false); + +#endif diff --git a/shared/rocfft_against_fftw.h b/shared/rocfft_against_fftw.h new file mode 100644 index 0000000..d03754c --- /dev/null +++ b/shared/rocfft_against_fftw.h @@ -0,0 +1,231 @@ +// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#pragma once +#ifndef ROCFFT_AGAINST_FFTW +#define ROCFFT_AGAINST_FFTW + +#include +#include +#include +#include + +#include "fftw_transform.h" + +// Return the precision enum for rocFFT based upon the type. +template +inline fft_precision precision_selector(); +template <> +inline fft_precision precision_selector() +{ + return fft_precision_single; +} +template <> +inline fft_precision precision_selector() +{ + return fft_precision_double; +} + +extern bool use_fftw_wisdom; + +// construct and return an FFTW plan with the specified type, +// precision, and dimensions. cpu_out is required if we're using +// wisdom, which runs actual FFTs to work out the best plan. 
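+//
+// Together with fftw_plan_via_rocfft and fftw_run below, the reference path
+// looks roughly like this (sketch only; `params` is assumed to be a populated
+// fft_params and `input`/`output` host buffers allocated for it):
+//
+//   auto cpu_plan = fftw_plan_via_rocfft<float>(params.length,
+//                                               params.istride,
+//                                               params.ostride,
+//                                               params.nbatch,
+//                                               params.idist,
+//                                               params.odist,
+//                                               params.transform_type,
+//                                               input,
+//                                               output);
+//   fftw_run<float>(params.transform_type, cpu_plan, input, output);
+//   fftw_destroy_plan_type(cpu_plan);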
+template +static typename fftw_trait::fftw_plan_type + fftw_plan_with_precision(const std::vector& dims, + const std::vector& howmany_dims, + const fft_transform_type transformType, + const size_t isize, + void* cpu_in, + void* cpu_out) +{ + using fftw_complex_type = typename fftw_trait::fftw_complex_type; + + // NB: Using FFTW_MEASURE implies that the input buffer's data + // may be destroyed during plan creation. But if we're wanting + // to run FFTW in the first place, we must have just created an + // uninitialized input buffer anyway. + + switch(transformType) + { + case fft_transform_type_complex_forward: + return fftw_plan_guru64_dft(dims.size(), + dims.data(), + howmany_dims.size(), + howmany_dims.data(), + reinterpret_cast(cpu_in), + reinterpret_cast(cpu_out), + -1, + use_fftw_wisdom ? FFTW_MEASURE : FFTW_ESTIMATE); + case fft_transform_type_complex_inverse: + return fftw_plan_guru64_dft(dims.size(), + dims.data(), + howmany_dims.size(), + howmany_dims.data(), + reinterpret_cast(cpu_in), + reinterpret_cast(cpu_out), + 1, + use_fftw_wisdom ? FFTW_MEASURE : FFTW_ESTIMATE); + case fft_transform_type_real_forward: + return fftw_plan_guru64_r2c(dims.size(), + dims.data(), + howmany_dims.size(), + howmany_dims.data(), + reinterpret_cast(cpu_in), + reinterpret_cast(cpu_out), + use_fftw_wisdom ? FFTW_MEASURE : FFTW_ESTIMATE); + case fft_transform_type_real_inverse: + return fftw_plan_guru64_c2r(dims.size(), + dims.data(), + howmany_dims.size(), + howmany_dims.data(), + reinterpret_cast(cpu_in), + reinterpret_cast(cpu_out), + use_fftw_wisdom ? FFTW_MEASURE : FFTW_ESTIMATE); + default: + throw std::runtime_error("Invalid transform type"); + } +} + +// construct an FFTW plan, given rocFFT parameters. output is +// required if planning with wisdom. +template +static typename fftw_trait::fftw_plan_type + fftw_plan_via_rocfft(const std::vector& length, + const std::vector& istride, + const std::vector& ostride, + const size_t nbatch, + const size_t idist, + const size_t odist, + const fft_transform_type transformType, + std::vector& input, + std::vector& output) +{ + // Dimension configuration: + std::vector dims(length.size()); + for(unsigned int idx = 0; idx < length.size(); ++idx) + { + dims[idx].n = length[idx]; + dims[idx].is = istride[idx]; + dims[idx].os = ostride[idx]; + } + + // Batch configuration: + std::vector howmany_dims(1); + howmany_dims[0].n = nbatch; + howmany_dims[0].is = idist; + howmany_dims[0].os = odist; + + return fftw_plan_with_precision(dims, + howmany_dims, + transformType, + idist * nbatch, + input.front().data(), + output.empty() ? nullptr : output.front().data()); +} + +template +void fftw_run(fft_transform_type transformType, + typename fftw_trait::fftw_plan_type cpu_plan, + std::vector& cpu_in, + std::vector& cpu_out) +{ + switch(transformType) + { + case fft_transform_type_complex_forward: + { + fftw_plan_execute_c2c(cpu_plan, cpu_in, cpu_out); + break; + } + case fft_transform_type_complex_inverse: + { + fftw_plan_execute_c2c(cpu_plan, cpu_in, cpu_out); + break; + } + case fft_transform_type_real_forward: + { + fftw_plan_execute_r2c(cpu_plan, cpu_in, cpu_out); + break; + } + case fft_transform_type_real_inverse: + { + fftw_plan_execute_c2r(cpu_plan, cpu_in, cpu_out); + break; + } + } +} + +// Given a transform type, return the contiguous input type. 
+inline fft_array_type contiguous_itype(const fft_transform_type transformType) +{ + switch(transformType) + { + case fft_transform_type_complex_forward: + case fft_transform_type_complex_inverse: + return fft_array_type_complex_interleaved; + case fft_transform_type_real_forward: + return fft_array_type_real; + case fft_transform_type_real_inverse: + return fft_array_type_hermitian_interleaved; + default: + throw std::runtime_error("Invalid transform type"); + } + return fft_array_type_complex_interleaved; +} + +// Given a transform type, return the contiguous output type. +inline fft_array_type contiguous_otype(const fft_transform_type transformType) +{ + switch(transformType) + { + case fft_transform_type_complex_forward: + case fft_transform_type_complex_inverse: + return fft_array_type_complex_interleaved; + case fft_transform_type_real_forward: + return fft_array_type_hermitian_interleaved; + case fft_transform_type_real_inverse: + return fft_array_type_real; + default: + throw std::runtime_error("Invalid transform type"); + } + return fft_array_type_complex_interleaved; +} + +// Given a precision, return the acceptable tolerance. +inline double type_epsilon(const fft_precision precision) +{ + switch(precision) + { + case fft_precision_half: + return type_epsilon<_Float16>(); + break; + case fft_precision_single: + return type_epsilon(); + break; + case fft_precision_double: + return type_epsilon(); + break; + default: + throw std::runtime_error("Invalid precision"); + } +} + +#endif diff --git a/shared/rocfft_complex.h b/shared/rocfft_complex.h new file mode 100644 index 0000000..efa0290 --- /dev/null +++ b/shared/rocfft_complex.h @@ -0,0 +1,346 @@ +// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#ifndef ROCFFT_COMPLEX_H +#define ROCFFT_COMPLEX_H + +#include +#if !defined(__HIPCC_RTC__) +#include +#endif +#include +#include + +#ifdef __HIP_PLATFORM_NVIDIA__ +typedef __half _Float16; +#endif + +template +struct rocfft_complex +{ + + Treal x; // Real part + Treal y; // Imaginary part + + // Constructors + // Do not initialize the members x or y by default, to ensure that it can + // be used in __shared__ and that it is a trivial class compatible with C. 
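+    // For instance (illustrative check), the following is expected to hold
+    // because the special member functions below are all defaulted:
+    //   static_assert(std::is_trivially_copyable<rocfft_complex<float>>::value, "");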
+ __device__ __host__ rocfft_complex() = default; + __device__ __host__ rocfft_complex(const rocfft_complex&) = default; + __device__ __host__ rocfft_complex(rocfft_complex&&) = default; + __device__ __host__ rocfft_complex& operator=(const rocfft_complex& rhs) & = default; + __device__ __host__ rocfft_complex& operator=(rocfft_complex&& rhs) & = default; + __device__ __host__ ~rocfft_complex() = default; + + // Constructor from real and imaginary parts + __device__ __host__ constexpr rocfft_complex(Treal real, Treal imag) + : x{real} + , y{imag} + { + } + + // Conversion from different precision + template + __device__ __host__ explicit constexpr rocfft_complex(const rocfft_complex& z) + : x(z.x) + , y(z.y) + { + } + + // Accessors + __device__ __host__ constexpr Treal real() const + { + return x; + } + + __device__ __host__ constexpr Treal imag() const + { + return y; + } + + // Unary operations + __forceinline__ __device__ __host__ rocfft_complex operator-() const + { + return {-x, -y}; + } + + __forceinline__ __device__ __host__ rocfft_complex operator+() const + { + return *this; + } + + __device__ __host__ Treal asum(const rocfft_complex& z) + { + return abs(z.x) + abs(z.y); + } + + // Internal real functions + static __forceinline__ __device__ __host__ Treal abs(Treal x) + { + return x < 0 ? -x : x; + } + + static __forceinline__ __device__ __host__ float sqrt(float x) + { + return ::sqrtf(x); + } + + static __forceinline__ __device__ __host__ double sqrt(double x) + { + return ::sqrt(x); + } + + // Addition operators + __device__ __host__ auto& operator+=(const rocfft_complex& rhs) + { + return *this = {x + rhs.x, y + rhs.y}; + } + + __device__ __host__ auto operator+(const rocfft_complex& rhs) const + { + auto lhs = *this; + return lhs += rhs; + } + + // Subtraction operators + __device__ __host__ auto& operator-=(const rocfft_complex& rhs) + { + return *this = {x - rhs.x, y - rhs.y}; + } + + __device__ __host__ auto operator-(const rocfft_complex& rhs) const + { + auto lhs = *this; + return lhs -= rhs; + } + + // Multiplication operators + __device__ __host__ auto& operator*=(const rocfft_complex& rhs) + { + return *this = {x * rhs.x - y * rhs.y, y * rhs.x + x * rhs.y}; + } + + __device__ __host__ auto operator*(const rocfft_complex& rhs) const + { + auto lhs = *this; + return lhs *= rhs; + } + + // Division operators + __device__ __host__ auto& operator/=(const rocfft_complex& rhs) + { + // Form of Robert L. 
Smith's Algorithm 116 + if(abs(rhs.x) > abs(rhs.y)) + { + Treal ratio = rhs.y / rhs.x; + Treal scale = 1 / (rhs.x + rhs.y * ratio); + *this = {(x + y * ratio) * scale, (y - x * ratio) * scale}; + } + else + { + Treal ratio = rhs.x / rhs.y; + Treal scale = 1 / (rhs.x * ratio + rhs.y); + *this = {(y + x * ratio) * scale, (y * ratio - x) * scale}; + } + return *this; + } + + __device__ __host__ auto operator/(const rocfft_complex& rhs) const + { + auto lhs = *this; + return lhs /= rhs; + } + + // Comparison operators + __device__ __host__ constexpr bool operator==(const rocfft_complex& rhs) const + { + return x == rhs.x && y == rhs.y; + } + + __device__ __host__ constexpr bool operator!=(const rocfft_complex& rhs) const + { + return !(*this == rhs); + } + + // Operators for complex-real computations + template + __device__ __host__ auto& operator+=(const U& rhs) + { + return (x += Treal(rhs)), *this; + } + + template + __device__ __host__ auto& operator-=(const U& rhs) + { + return (x -= Treal(rhs)), *this; + } + + __device__ __host__ auto operator+(const Treal& rhs) + { + auto lhs = *this; + return lhs += rhs; + } + + __device__ __host__ auto operator-(const Treal& rhs) + { + auto lhs = *this; + return lhs -= rhs; + } + + template + __device__ __host__ auto& operator*=(const U& rhs) + { + return (x *= Treal(rhs)), (y *= Treal(rhs)), *this; + } + + template + __device__ __host__ auto operator*(const U& rhs) const + { + auto lhs = *this; + return lhs *= Treal(rhs); + } + + template + __device__ __host__ auto& operator/=(const U& rhs) + { + return (x /= Treal(rhs)), (y /= Treal(rhs)), *this; + } + + template + __device__ __host__ auto operator/(const U& rhs) const + { + auto lhs = *this; + return lhs /= Treal(rhs); + } + + template + __device__ __host__ constexpr bool operator==(const U& rhs) const + { + return x == Treal(rhs) && y == 0; + } + + template + __device__ __host__ constexpr bool operator!=(const U& rhs) const + { + return !(*this == rhs); + } +}; + +// Stream operators +#if !defined(__HIPCC_RTC__) +static std::ostream& operator<<(std::ostream& stream, const _Float16& f) +{ + return stream << static_cast(f); +} + +template +std::ostream& operator<<(std::ostream& out, const rocfft_complex& z) +{ + return out << '(' << static_cast(z.x) << ',' << static_cast(z.y) << ')'; +} +#endif + +// Operators for real-complex computations +template +__device__ __host__ rocfft_complex operator+(const U& lhs, const rocfft_complex& rhs) +{ + return {Treal(lhs) + rhs.x, rhs.y}; +} + +template +__device__ __host__ rocfft_complex operator-(const U& lhs, const rocfft_complex& rhs) +{ + return {Treal(lhs) - rhs.x, -rhs.y}; +} + +template +__device__ __host__ rocfft_complex operator*(const U& lhs, const rocfft_complex& rhs) +{ + return {Treal(lhs) * rhs.x, Treal(lhs) * rhs.y}; +} + +template +__device__ __host__ rocfft_complex operator/(const U& lhs, const rocfft_complex& rhs) +{ + // Form of Robert L. 
Smith's Algorithm 116 + if(rocfft_complex::abs(rhs.x) > rocfft_complex::abs(rhs.y)) + { + Treal ratio = rhs.y / rhs.x; + Treal scale = Treal(lhs) / (rhs.x + rhs.y * ratio); + return {scale, -scale * ratio}; + } + else + { + Treal ratio = rhs.x / rhs.y; + Treal scale = Treal(lhs) / (rhs.x * ratio + rhs.y); + return {ratio * scale, -scale}; + } +} + +template +__device__ __host__ constexpr bool operator==(const U& lhs, const rocfft_complex& rhs) +{ + return Treal(lhs) == rhs.x && 0 == rhs.y; +} + +template +__device__ __host__ constexpr bool operator!=(const U& lhs, const rocfft_complex& rhs) +{ + return !(lhs == rhs); +} + +// Extending std namespace to handle rocfft_complex datatype +namespace std +{ + template + __device__ __host__ constexpr Treal real(const rocfft_complex& z) + { + return z.x; + } + + template + __device__ __host__ constexpr Treal imag(const rocfft_complex& z) + { + return z.y; + } + + template + __device__ __host__ constexpr rocfft_complex conj(const rocfft_complex& z) + { + return {z.x, -z.y}; + } + + template + __device__ __host__ inline Treal norm(const rocfft_complex& z) + { + return (z.x * z.x) + (z.y * z.y); + } + + template + __device__ __host__ inline Treal abs(const rocfft_complex& z) + { + Treal tr = rocfft_complex::abs(z.x), ti = rocfft_complex::abs(z.y); + return tr > ti ? (ti /= tr, tr * rocfft_complex::sqrt(ti * ti + 1)) + : ti ? (tr /= ti, ti * rocfft_complex::sqrt(tr * tr + 1)) + : 0; + } +} + +#endif // ROCFFT_COMPLEX_H diff --git a/shared/rocfft_hip.h b/shared/rocfft_hip.h new file mode 100644 index 0000000..e086cab --- /dev/null +++ b/shared/rocfft_hip.h @@ -0,0 +1,52 @@ +// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +#ifndef __ROCFFT_HIP_H__ +#define __ROCFFT_HIP_H__ + +#include +#include + +class rocfft_scoped_device +{ +public: + rocfft_scoped_device(int device) + { + if(hipGetDevice(&orig_device) != hipSuccess) + throw std::runtime_error("hipGetDevice failure"); + + if(hipSetDevice(device) != hipSuccess) + throw std::runtime_error("hipSetDevice failure"); + } + ~rocfft_scoped_device() + { + (void)hipSetDevice(orig_device); + } + + // not copyable or movable + rocfft_scoped_device(const rocfft_scoped_device&) = delete; + rocfft_scoped_device(rocfft_scoped_device&&) = delete; + rocfft_scoped_device& operator=(const rocfft_scoped_device&) = delete; + +private: + int orig_device; +}; + +#endif // __ROCFFT_HIP_H__ diff --git a/shared/rocfft_params.h b/shared/rocfft_params.h new file mode 100644 index 0000000..bf9b728 --- /dev/null +++ b/shared/rocfft_params.h @@ -0,0 +1,585 @@ +// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +#ifndef ROCFFT_PARAMS_H +#define ROCFFT_PARAMS_H + +#include "../shared/fft_params.h" +#include "../shared/gpubuf.h" +#include "rocfft/rocfft.h" + +// Return the string of the rocfft_status code +static std::string rocfft_status_to_string(const rocfft_status ret) +{ + switch(ret) + { + case rocfft_status_success: + return "rocfft_status_success"; + case rocfft_status_failure: + return "rocfft_status_failure"; + case rocfft_status_invalid_arg_value: + return "rocfft_status_invalid_arg_value"; + case rocfft_status_invalid_dimensions: + return "rocfft_status_invalid_dimensions"; + case rocfft_status_invalid_array_type: + return "rocfft_status_invalid_array_type"; + case rocfft_status_invalid_strides: + return "rocfft_status_invalid_strides"; + case rocfft_status_invalid_distance: + return "rocfft_status_invalid_distance"; + case rocfft_status_invalid_offset: + return "rocfft_status_invalid_offset"; + case rocfft_status_invalid_work_buffer: + return "rocfft_status_invalid_work_buffer"; + default: + throw std::runtime_error("unknown rocfft_status"); + } +} + +inline fft_status fft_status_from_rocfftparams(const rocfft_status val) +{ + switch(val) + { + case rocfft_status_success: + return fft_status_success; + case rocfft_status_failure: + return fft_status_failure; + case rocfft_status_invalid_arg_value: + return fft_status_invalid_arg_value; + case rocfft_status_invalid_dimensions: + return fft_status_invalid_dimensions; + case rocfft_status_invalid_array_type: + return fft_status_invalid_array_type; + case rocfft_status_invalid_strides: + return fft_status_invalid_strides; + case rocfft_status_invalid_distance: + return fft_status_invalid_distance; + case rocfft_status_invalid_offset: + return fft_status_invalid_offset; + case rocfft_status_invalid_work_buffer: + return fft_status_invalid_work_buffer; + default: + throw std::runtime_error("Invalid status"); + } +} + +inline rocfft_precision rocfft_precision_from_fftparams(const fft_precision val) +{ + switch(val) + { + case fft_precision_single: + return rocfft_precision_single; + case fft_precision_double: + return rocfft_precision_double; + case fft_precision_half: + return rocfft_precision_half; + default: + throw std::runtime_error("Invalid precision"); + } +} + +inline rocfft_array_type rocfft_array_type_from_fftparams(const fft_array_type val) +{ + switch(val) + { + case fft_array_type_complex_interleaved: + return rocfft_array_type_complex_interleaved; + case fft_array_type_complex_planar: + return rocfft_array_type_complex_planar; + case fft_array_type_real: + return rocfft_array_type_real; + case fft_array_type_hermitian_interleaved: + return rocfft_array_type_hermitian_interleaved; + case fft_array_type_hermitian_planar: + return rocfft_array_type_hermitian_planar; + case fft_array_type_unset: + return rocfft_array_type_unset; + } + return rocfft_array_type_unset; +} + +inline rocfft_transform_type rocfft_transform_type_from_fftparams(const fft_transform_type val) +{ + switch(val) + { + case fft_transform_type_complex_forward: + return rocfft_transform_type_complex_forward; + case fft_transform_type_complex_inverse: + return rocfft_transform_type_complex_inverse; + case fft_transform_type_real_forward: + return rocfft_transform_type_real_forward; + case fft_transform_type_real_inverse: + return rocfft_transform_type_real_inverse; + default: + throw std::runtime_error("Invalid transform type"); + } +} + +inline rocfft_result_placement + rocfft_result_placement_from_fftparams(const fft_result_placement val) +{ + switch(val) + { + 
+    case fft_placement_inplace:
+        return rocfft_placement_inplace;
+    case fft_placement_notinplace:
+        return rocfft_placement_notinplace;
+    default:
+        throw std::runtime_error("Invalid result placement");
+    }
+}
+
+class rocfft_params : public fft_params
+{
+public:
+    rocfft_plan plan = nullptr;
+    rocfft_execution_info info = nullptr;
+    rocfft_plan_description desc = nullptr;
+    gpubuf_t<char> wbuffer;
+
+    explicit rocfft_params(){};
+
+    explicit rocfft_params(const fft_params& p)
+        : fft_params(p){};
+
+    rocfft_params(const rocfft_params&) = delete;
+    rocfft_params& operator=(const rocfft_params&) = delete;
+
+    ~rocfft_params()
+    {
+        free();
+    };
+
+    void free()
+    {
+        if(plan != nullptr)
+        {
+            rocfft_plan_destroy(plan);
+            plan = nullptr;
+        }
+        if(info != nullptr)
+        {
+            rocfft_execution_info_destroy(info);
+            info = nullptr;
+        }
+        if(desc != nullptr)
+        {
+            rocfft_plan_description_destroy(desc);
+            desc = nullptr;
+        }
+        wbuffer.free();
+    }
+
+    void validate_fields() const override
+    {
+        // row-major lengths including batch (i.e. batch is at the front)
+        std::vector<size_t> length_with_batch{nbatch};
+        std::copy(length.begin(), length.end(), std::back_inserter(length_with_batch));
+
+        auto validate_field = [&](const fft_field& f) {
+            for(const auto& b : f.bricks)
+            {
+                // bricks must have same dim as FFT, including batch
+                if(b.lower.size() != length.size() + 1 || b.upper.size() != length.size() + 1
+                   || b.stride.size() != length.size() + 1)
+                    throw std::runtime_error(
+                        "brick dimension does not match FFT + batch dimension");
+
+                // ensure lower < upper, and that both fit in the FFT + batch dims
+                if(!std::lexicographical_compare(
+                       b.lower.begin(), b.lower.end(), b.upper.begin(), b.upper.end()))
+                    throw std::runtime_error("brick lower index is not less than upper index");
+
+                if(!std::lexicographical_compare(b.lower.begin(),
+                                                 b.lower.end(),
+                                                 length_with_batch.begin(),
+                                                 length_with_batch.end()))
+                    throw std::runtime_error(
+                        "brick lower index is not less than FFT + batch length");
+
+                if(!std::lexicographical_compare(b.upper.begin(),
+                                                 b.upper.end(),
+                                                 length_with_batch.begin(),
+                                                 length_with_batch.end())
+                   && b.upper != length_with_batch)
+                    throw std::runtime_error("brick upper index is not <= FFT + batch length");
+            }
+        };
+
+        for(const auto& ifield : ifields)
+            validate_field(ifield);
+        for(const auto& ofield : ofields)
+            validate_field(ofield);
+    }
+
+    rocfft_precision get_rocfft_precision()
+    {
+        return rocfft_precision_from_fftparams(precision);
+    }
+
+    size_t vram_footprint() override
+    {
+        size_t val = fft_params::vram_footprint();
+        if(setup_structs() != fft_status_success)
+        {
+            throw std::runtime_error("Struct setup failed");
+        }
+        val += workbuffersize;
+
+        return val;
+    }
+
+    // Convert the generic fft_field structure to a rocfft_field
+    // structure that can be passed to rocFFT. In particular, we need
+    // to convert from row-major to column-major.
+    static rocfft_field fft_field_to_rocfft_field(const fft_field& f)
+    {
+        rocfft_field rfield = nullptr;
+        if(f.bricks.empty())
+            return rfield;
+
+        if(rocfft_field_create(&rfield) != rocfft_status_success)
+            throw std::runtime_error("rocfft_field_create failed");
+        for(const auto& b : f.bricks)
+        {
+            // rocFFT wants column-major bricks and fft_params stores
+            // row-major
+            std::vector<size_t> lower_cm;
+            std::copy(b.lower.rbegin(), b.lower.rend(), std::back_inserter(lower_cm));
+            std::vector<size_t> upper_cm;
+            std::copy(b.upper.rbegin(), b.upper.rend(), std::back_inserter(upper_cm));
+            std::vector<size_t> stride_cm;
+            std::copy(b.stride.rbegin(), b.stride.rend(), std::back_inserter(stride_cm));
+
+            rocfft_brick rbrick = nullptr;
+            if(rocfft_brick_create(&rbrick,
+                                   lower_cm.data(), // field_lower
+                                   upper_cm.data(), // field_upper
+                                   stride_cm.data(), // brick_stride
+                                   lower_cm.size(), // dim
+                                   b.device) // deviceID
+               != rocfft_status_success)
+                throw std::runtime_error("rocfft_brick_create failed");
+
+            if(rocfft_field_add_brick(rfield, rbrick) != rocfft_status_success)
+                throw std::runtime_error("rocfft_field_add_brick failed");
+
+            rocfft_brick_destroy(rbrick);
+        }
+        return rfield;
+    }
+
+    fft_status setup_structs()
+    {
+        rocfft_status fft_status = rocfft_status_success;
+        if(desc == nullptr)
+        {
+            fft_status = rocfft_plan_description_create(&desc);
+            if(fft_status != rocfft_status_success)
+                return fft_status_from_rocfftparams(fft_status);
+
+            fft_status
+                = rocfft_plan_description_set_data_layout(desc,
+                                                          rocfft_array_type_from_fftparams(itype),
+                                                          rocfft_array_type_from_fftparams(otype),
+                                                          ioffset.data(),
+                                                          ooffset.data(),
+                                                          istride_cm().size(),
+                                                          istride_cm().data(),
+                                                          idist,
+                                                          ostride_cm().size(),
+                                                          ostride_cm().data(),
+                                                          odist);
+            if(fft_status != rocfft_status_success)
+            {
+                throw std::runtime_error("rocfft_plan_description_set_data_layout failed");
+            }
+
+            if(scale_factor != 1.0)
+            {
+                fft_status = rocfft_plan_description_set_scale_factor(desc, scale_factor);
+                if(fft_status != rocfft_status_success)
+                {
+                    throw std::runtime_error("rocfft_plan_description_set_scale_factor failed");
+                }
+            }
+
+            for(const auto& ifield : ifields)
+            {
+                rocfft_field infield = fft_field_to_rocfft_field(ifield);
+                if(rocfft_plan_description_add_infield(desc, infield) != rocfft_status_success)
+                    throw std::runtime_error("rocfft_description_add_infield failed");
+                rocfft_field_destroy(infield);
+            }
+
+            for(const auto& ofield : ofields)
+            {
+                rocfft_field outfield = fft_field_to_rocfft_field(ofield);
+                if(rocfft_plan_description_add_outfield(desc, outfield) != rocfft_status_success)
+                    throw std::runtime_error("rocfft_description_add_outfield failed");
+                rocfft_field_destroy(outfield);
+            }
+        }
+
+        if(plan == nullptr)
+        {
+            fft_status = rocfft_plan_create(&plan,
+                                            rocfft_result_placement_from_fftparams(placement),
+                                            rocfft_transform_type_from_fftparams(transform_type),
+                                            get_rocfft_precision(),
+                                            length_cm().size(),
+                                            length_cm().data(),
+                                            nbatch,
+                                            desc);
+            if(fft_status != rocfft_status_success)
+            {
+                throw std::runtime_error("rocfft_plan_create failed");
+            }
+        }
+
+        if(info == nullptr)
+        {
+            fft_status = rocfft_execution_info_create(&info);
+            if(fft_status != rocfft_status_success)
+            {
+                throw std::runtime_error("rocfft_execution_info_create failed");
+            }
+        }
+
+        fft_status = rocfft_plan_get_work_buffer_size(plan, &workbuffersize);
+        if(fft_status != rocfft_status_success)
+        {
+            throw std::runtime_error("rocfft_plan_get_work_buffer_size failed");
+        }
+
+        return fft_status_from_rocfftparams(fft_status);
+    }
+
+    fft_status create_plan() override
+    {
+        fft_status ret = setup_structs();
+        if(ret != fft_status_success)
+        {
+            return ret;
+        }
+        if(workbuffersize > 0)
+        {
+            hipError_t hip_status = hipSuccess;
+            hip_status = wbuffer.alloc(workbuffersize);
+            if(hip_status != hipSuccess)
+            {
+                std::ostringstream oss;
+                oss << "work buffer allocation failed (" << workbuffersize << " requested)";
+                size_t mem_free = 0;
+                size_t mem_total = 0;
+                hip_status = hipMemGetInfo(&mem_free, &mem_total);
+                if(hip_status == hipSuccess)
+                {
+                    oss << "free vram: " << mem_free << " total vram: " << mem_total;
+                }
+                else
+                {
+                    oss << "hipMemGetInfo also failed";
+                }
+                throw work_buffer_alloc_failure(oss.str());
+            }
+
+            auto rocret
+                = rocfft_execution_info_set_work_buffer(info, wbuffer.data(), workbuffersize);
+            if(rocret != rocfft_status_success)
+            {
+                throw std::runtime_error("rocfft_execution_info_set_work_buffer failed");
+            }
+        }
+
+        return ret;
+    }
+
+    fft_status set_callbacks(void* load_cb_host,
+                             void* load_cb_data,
+                             void* store_cb_host,
+                             void* store_cb_data) override
+    {
+        if(run_callbacks)
+        {
+            auto roc_status
+                = rocfft_execution_info_set_load_callback(info, &load_cb_host, &load_cb_data, 0);
+            if(roc_status != rocfft_status_success)
+                return fft_status_from_rocfftparams(roc_status);
+
+            roc_status
+                = rocfft_execution_info_set_store_callback(info, &store_cb_host, &store_cb_data, 0);
+            if(roc_status != rocfft_status_success)
+                return fft_status_from_rocfftparams(roc_status);
+        }
+        return fft_status_success;
+    }
+
+    fft_status execute(void** in, void** out) override
+    {
+        auto ret = rocfft_execute(plan, in, out, info);
+        return fft_status_from_rocfftparams(ret);
+    }
+
+    // scatter data to multiple GPUs and adjust I/O buffers to match
+    void multi_gpu_prepare(std::vector<gpubuf>& ibuffer,
+                           std::vector<void*>& pibuffer,
+                           std::vector<void*>& pobuffer) override
+    {
+        auto alloc_fields = [&](const fft_params::fft_field& field,
+                                fft_array_type array_type,
+                                std::vector<void*>& pbuffer,
+                                bool copy_input) {
+            if(field.bricks.empty())
+                return;
+
+            // we have a field defined, clear the list of buffers as
+            // we'll be allocating new ones for each brick
+            pbuffer.clear();
+
+            for(const auto& b : field.bricks)
+            {
+                // get brick's length - note that this includes batch
+                // dimension
+                const auto brick_len = b.length();
+                const auto brick_stride = b.stride;
+
+                const size_t brick_size_elems = product(brick_len.begin(), brick_len.end());
+                const size_t elem_size_bytes = var_size(precision, array_type);
+                const size_t brick_size_bytes = brick_size_elems * elem_size_bytes;
+
+                // set device for the alloc, but we want to return to the
+                // default device as the source of a following memcpy
+                {
+                    rocfft_scoped_device dev(b.device);
+                    multi_gpu_data.emplace_back();
+                    if(multi_gpu_data.back().alloc(brick_size_bytes) != hipSuccess)
+                        throw std::runtime_error("device allocation failure");
+                    pbuffer.push_back(multi_gpu_data.back().data());
+                }
+
+                if(copy_input)
+                {
+                    // For now, assume we're only splitting on highest FFT
+                    // dimension, lower-dimensional FFT data is all
+                    // contiguous, and batches are contiguous in each brick.
+                    //
+                    // That means we can express this as a 2D memcpy.
+                    const size_t unbatched_elems_per_brick
+                        = product(brick_len.begin() + 1, brick_len.end());
+                    const size_t unbatched_elems_per_fft = product(length.begin(), length.end());
+
+                    // get this brick's starting offset in the field
+                    const size_t brick_offset
+                        = b.lower_field_offset(istride, idist) * elem_size_bytes;
+
+                    // copy from original input - note that we're
+                    // assuming interleaved data so ibuffer has only one
+                    // gpubuf
+                    if(hipMemcpy2D(pbuffer.back(),
+                                   unbatched_elems_per_brick * elem_size_bytes,
+                                   ibuffer.front().data_offset(brick_offset),
+                                   unbatched_elems_per_fft * elem_size_bytes,
+                                   unbatched_elems_per_brick * elem_size_bytes,
+                                   brick_len.front(),
+                                   hipMemcpyHostToDevice)
+                       != hipSuccess)
+                        throw std::runtime_error("hipMemcpy failure");
+                }
+            }
+
+            // if we copied the input to all the other devices, and
+            // this is an out-of-place transform, we no longer
+            // need the original input
+            if(copy_input && placement == fft_placement_notinplace)
+                ibuffer.clear();
+        };
+
+        // assume one input, one output field for simple cases
+        if(!ifields.empty())
+            alloc_fields(ifields.front(), itype, pibuffer, true);
+        if(!ofields.empty())
+        {
+            if(!ifields.empty() && placement == fft_placement_inplace)
+                pobuffer = pibuffer;
+            else
+                alloc_fields(ofields.front(), otype, pobuffer, false);
+        }
+    }
+
+    // when preparing for multi-GPU transform, we need to allocate data
+    // on each GPU. This vector remembers all of those allocations.
+    std::vector<gpubuf> multi_gpu_data;
+
+    // gather data after multi-GPU FFT for verification
+    void multi_gpu_finalize(std::vector<gpubuf>& obuffer, std::vector<void*>& pobuffer) override
+    {
+        if(ofields.empty())
+            return;
+
+        for(size_t i = 0; i < ofields.front().bricks.size(); ++i)
+        {
+            const auto& b = ofields.front().bricks[i];
+            const auto& brick_ptr = pobuffer[i];
+
+            const auto brick_len = b.length();
+
+            const size_t elem_size_bytes = var_size(precision, otype);
+
+            // get this brick's starting offset in the field
+            const size_t brick_offset = b.lower_field_offset(ostride, odist) * elem_size_bytes;
+
+            // switch device to where we're copying from
+            rocfft_scoped_device dev(b.device);
+
+            // For now, assume we're only splitting on highest FFT
+            // dimension, lower-dimensional FFT data is all
+            // contiguous, and batches are contiguous in each brick.
+            //
+            // That means we can express this as a 2D memcpy.
+            const size_t unbatched_elems_per_brick
+                = product(brick_len.begin() + 1, brick_len.end());
+            const auto output_length = olength();
+            const size_t unbatched_elems_per_fft
+                = product(output_length.begin(), output_length.end());
+
+            // copy to original output buffer - note that
+            // we're assuming interleaved data so obuffer
+            // has only one gpubuf
+            if(hipMemcpy2D(obuffer.front().data_offset(brick_offset),
+                           unbatched_elems_per_fft * elem_size_bytes,
+                           brick_ptr,
+                           unbatched_elems_per_brick * elem_size_bytes,
+                           unbatched_elems_per_brick * elem_size_bytes,
+                           brick_len.front(),
+                           hipMemcpyDeviceToDevice)
+               != hipSuccess)
+                throw std::runtime_error("hipMemcpy failure");
+
+            // device-to-device transfers don't synchronize with the
+            // host, add explicit sync
+            (void)hipDeviceSynchronize();
+        }
+        pobuffer.clear();
+        pobuffer.push_back(obuffer.front().data());
+    }
+};
+
+#endif
diff --git a/shared/test_params.h b/shared/test_params.h
new file mode 100644
index 0000000..8d8f6f7
--- /dev/null
+++ b/shared/test_params.h
@@ -0,0 +1,51 @@
+// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#pragma once
+#ifndef TESTCONSTANTS_H
+#define TESTCONSTANTS_H
+
+#include <cstddef>
+
+extern int verbose;
+extern size_t ramgb;
+extern size_t vramgb;
+
+extern size_t n_random_tests;
+
+extern size_t random_seed;
+extern double planar_prob;
+extern double callback_prob;
+
+extern double half_epsilon;
+extern double single_epsilon;
+extern double double_epsilon;
+extern bool skip_runtime_fails;
+
+extern double max_linf_eps_double;
+extern double max_l2_eps_double;
+extern double max_linf_eps_single;
+extern double max_l2_eps_single;
+extern double max_linf_eps_half;
+extern double max_l2_eps_half;
+
+extern int n_hip_failures;
+
+#endif
diff --git a/shared/work_queue.h b/shared/work_queue.h
new file mode 100644
index 0000000..e13fc41
--- /dev/null
+++ b/shared/work_queue.h
@@ -0,0 +1,49 @@
+// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#pragma once
+
+#include <condition_variable>
+#include <mutex>
+#include <queue>
+template <class _WorkItem>
+struct WorkQueue
+{
+    void push(_WorkItem&& i)
+    {
+        std::unique_lock<std::mutex> lock(queueMutex);
+        items.emplace(std::move(i));
+        emptyWait.notify_all();
+    }
+    _WorkItem pop()
+    {
+        std::unique_lock<std::mutex> lock(queueMutex);
+        while(items.empty())
+            emptyWait.wait(lock);
+        _WorkItem item(items.front());
+        items.pop();
+        return item;
+    }
+
+private:
+    std::queue<_WorkItem> items;
+    std::mutex queueMutex;
+    std::condition_variable emptyWait;
+};
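
For reference, a minimal sketch (not part of the patch) of how the WorkQueue template added in shared/work_queue.h might be exercised; the int work items, the sentinel value, and the single consumer thread here are illustrative assumptions, not taken from the hipFFT clients.

    // build with a C++11 (or later) compiler, e.g. hipcc or g++
    #include <cstdio>
    #include <thread>

    #include "work_queue.h"

    int main()
    {
        WorkQueue<int> queue;

        // one consumer thread pops items until it sees a negative sentinel
        std::thread consumer([&queue] {
            for(;;)
            {
                int item = queue.pop(); // blocks until an item is available
                if(item < 0)
                    break;
                std::printf("got %d\n", item);
            }
        });

        // producer pushes a few work items, then the sentinel
        for(int i = 0; i < 4; ++i)
            queue.push(int(i));
        queue.push(-1);

        consumer.join();
        return 0;
    }

push() locks the mutex, enqueues the item, and notifies; pop() waits on the condition variable while the queue is empty, so the consumer never busy-waits.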