From 2b4e03bf00d76badb3e06ba5b1a22b381aeb0bdb Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 7 Oct 2024 08:26:22 -0600 Subject: [PATCH] Batched - Dense Serial QR: fixing issue with work array We did not pass the stride of the work array to internal routines and we are not enforcing a contiguous memory allocation either so when using subviews to pass the work array, we run into troubles. Signed-off-by: Luc Berger-Vergiat --- ...tched_ApplyHouseholder_Serial_Internal.hpp | 21 +- .../impl/KokkosBatched_ApplyQ_Serial_Impl.hpp | 6 +- .../KokkosBatched_ApplyQ_Serial_Internal.hpp | 13 +- ...KokkosBatched_QR_FormQ_Serial_Internal.hpp | 6 +- .../impl/KokkosBatched_QR_Serial_Impl.hpp | 2 +- .../impl/KokkosBatched_QR_Serial_Internal.hpp | 4 +- .../KokkosBatched_SVD_Serial_Internal.hpp | 11 +- .../dense/src/KokkosBatched_ApplyQ_Decl.hpp | 6 +- .../dense/unit_test/Test_Batched_SerialQR.hpp | 224 +++++++++++------- 9 files changed, 179 insertions(+), 114 deletions(-) diff --git a/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Internal.hpp index 574280c82b..33b59b81b2 100644 --- a/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Internal.hpp @@ -34,8 +34,8 @@ struct SerialApplyLeftHouseholderInternal { /* */ ValueType* u2, const int u2s, /* */ ValueType* a1t, const int a1ts, /* */ ValueType* A2, const int as0, const int as1, - /* */ ValueType* w1t) { - typedef ValueType value_type; + /* */ ValueType* w1t, const int ws0) { + using value_type = ValueType; /// u2 m x 1 /// a1t 1 x n @@ -54,15 +54,15 @@ struct SerialApplyLeftHouseholderInternal { for (int j = 0; j < n; ++j) { value_type tmp = a1t[j * a1ts]; for (int i = 0; i < m; ++i) tmp += Kokkos::ArithTraits::conj(u2[i * u2s]) * A2[i * as0 + j * as1]; - w1t[j] = tmp * inv_tau; // /= (*tau); + w1t[j * ws0] = tmp * inv_tau; // /= (*tau); } // a1t -= w1t (axpy) - for (int j = 0; j < n; ++j) a1t[j * a1ts] -= w1t[j]; + for (int j = 0; j < n; ++j) a1t[j * a1ts] -= w1t[j * ws0]; // A2 -= u2 w1t (ger) for (int j = 0; j < n; ++j) - for (int i = 0; i < m; ++i) A2[i * as0 + j * as1] -= u2[i * u2s] * w1t[j]; + for (int i = 0; i < m; ++i) A2[i * as0 + j * as1] -= u2[i * u2s] * w1t[j * ws0]; return 0; } @@ -74,8 +74,8 @@ struct SerialApplyRightHouseholderInternal { /* */ ValueType* u2, const int u2s, /* */ ValueType* a1, const int a1s, /* */ ValueType* A2, const int as0, const int as1, - /* */ ValueType* w1) { - typedef ValueType value_type; + /* */ ValueType* w1, const int ws0) { + using value_type = ValueType; /// u2 n x 1 /// a1 m x 1 /// A2 m x n @@ -93,15 +93,16 @@ struct SerialApplyRightHouseholderInternal { for (int i = 0; i < m; ++i) { value_type tmp = a1[i * a1s]; for (int j = 0; j < n; ++j) tmp += A2[i * as0 + j * as1] * u2[j * u2s]; - w1[i] = tmp * inv_tau; // \= (*tau); + w1[i * ws0] = tmp * inv_tau; // \= (*tau); } // a1 -= w1 (axpy) - for (int i = 0; i < m; ++i) a1[i * a1s] -= w1[i]; + for (int i = 0; i < m; ++i) a1[i * a1s] -= w1[i * ws0]; // A2 -= w1 * u2' (ger with conjugate) for (int j = 0; j < n; ++j) - for (int i = 0; i < m; ++i) A2[i * as0 + j * as1] -= w1[i] * Kokkos::ArithTraits::conj(u2[j * u2s]); + for (int i = 0; i < m; ++i) + A2[i * as0 + j * as1] -= w1[i * ws0] * Kokkos::ArithTraits::conj(u2[j * u2s]); return 0; } diff --git a/batched/dense/impl/KokkosBatched_ApplyQ_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_ApplyQ_Serial_Impl.hpp index f1f2d29089..02335f319f 100644 --- a/batched/dense/impl/KokkosBatched_ApplyQ_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_ApplyQ_Serial_Impl.hpp @@ -33,7 +33,7 @@ KOKKOS_INLINE_FUNCTION int SerialApplyQ @@ -42,7 +42,7 @@ KOKKOS_INLINE_FUNCTION int SerialApplyQ @@ -51,7 +51,7 @@ KOKKOS_INLINE_FUNCTION int SerialApplyQ KOKKOS_INLINE_FUNCTION int SerialQR::invoke(const AViewType &A, const tViewType &t, const wViewType &w) { return SerialQR_Internal::invoke(A.extent(0), A.extent(1), A.data(), A.stride_0(), A.stride_1(), t.data(), - t.stride_0(), w.data()); + t.stride_0(), w.data(), w.stride_0()); } } // namespace KokkosBatched diff --git a/batched/dense/impl/KokkosBatched_QR_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_QR_Serial_Internal.hpp index 851bd85ab9..04eda350f8 100644 --- a/batched/dense/impl/KokkosBatched_QR_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_QR_Serial_Internal.hpp @@ -36,7 +36,7 @@ struct SerialQR_Internal { const int n, // n = NumCols(A) /* */ ValueType *A, const int as0, const int as1, /* */ ValueType *t, const int ts, - /* */ ValueType *w) { + /* */ ValueType *w, const int ws) { typedef ValueType value_type; /// Given a matrix A, it computes QR decomposition of the matrix @@ -69,7 +69,7 @@ struct SerialQR_Internal { // left apply householder to A22 SerialApplyLeftHouseholderInternal::invoke(m_A22, n_A22, tau, A_part3x3.A21, as0, A_part3x3.A12, as1, - A_part3x3.A22, as0, as1, w); + A_part3x3.A22, as0, as1, w, ws); /// ----------------------------------------------------- A_part2x2.mergeToATL(A_part3x3); t_part2x1.mergeToAT(t_part3x1); diff --git a/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp index 56a5619e06..6b4461e5ec 100644 --- a/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp @@ -176,11 +176,12 @@ struct SerialSVDInternal { if (n - i > 1) { KokkosBatched::SerialApplyLeftHouseholderInternal::invoke( m - i - 1, n - i - 1, &tau, &SVDIND(A, i + 1, i), As0, &SVDIND(A, i, i + 1), As1, &SVDIND(A, i + 1, i + 1), - As0, As1, work); + As0, As1, work, 1); } if (U) { - KokkosBatched::SerialApplyRightHouseholderInternal::invoke( - m, m - i - 1, &tau, &SVDIND(A, i + 1, i), As0, &SVDIND(U, 0, i), Us0, &SVDIND(U, 0, i + 1), Us0, Us1, work); + KokkosBatched::SerialApplyRightHouseholderInternal::invoke(m, m - i - 1, &tau, &SVDIND(A, i + 1, i), + As0, &SVDIND(U, 0, i), Us0, + &SVDIND(U, 0, i + 1), Us0, Us1, work, 1); } // Zero out A subdiag explicitly (NOTE: may not be necessary...) for (int j = i + 1; j < m; j++) { @@ -193,12 +194,12 @@ struct SerialSVDInternal { if (m - i > 1) { KokkosBatched::SerialApplyRightHouseholderInternal::invoke( m - i - 1, n - i - 2, &tau, &SVDIND(A, i, i + 2), As1, &SVDIND(A, i + 1, i + 1), As0, - &SVDIND(A, i + 1, i + 2), As0, As1, work); + &SVDIND(A, i + 1, i + 2), As0, As1, work, 1); } if (Vt) { KokkosBatched::SerialApplyLeftHouseholderInternal::invoke( n - i - 2, n, &tau, &SVDIND(A, i, i + 2), As1, &SVDIND(Vt, i + 1, 0), Vts1, &SVDIND(Vt, i + 2, 0), Vts0, - Vts1, work); + Vts1, work, 1); } // Zero out A superdiag row explicitly for (int j = i + 2; j < n; j++) { diff --git a/batched/dense/src/KokkosBatched_ApplyQ_Decl.hpp b/batched/dense/src/KokkosBatched_ApplyQ_Decl.hpp index 9d17a8b435..589ca192dd 100644 --- a/batched/dense/src/KokkosBatched_ApplyQ_Decl.hpp +++ b/batched/dense/src/KokkosBatched_ApplyQ_Decl.hpp @@ -64,11 +64,11 @@ struct ApplyQ { KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const tViewType &t, const BViewType &B, const wViewType &w) { int r_val = 0; - if (std::is_same::value) { + if constexpr (std::is_same_v) { r_val = SerialApplyQ::invoke(A, t, B, w); - } else if (std::is_same::value) { + } else if constexpr (std::is_same_v) { r_val = TeamApplyQ::invoke(member, A, t, B, w); - } else if (std::is_same::value) { + } else if constexpr (std::is_same_v) { r_val = TeamVectorApplyQ::invoke(member, A, t, B, w); } return r_val; diff --git a/batched/dense/unit_test/Test_Batched_SerialQR.hpp b/batched/dense/unit_test/Test_Batched_SerialQR.hpp index 4ccb81400a..375b1d1664 100644 --- a/batched/dense/unit_test/Test_Batched_SerialQR.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialQR.hpp @@ -25,52 +25,92 @@ #include // KokkosBlas::Algo #include -template +template struct qrFunctor { using Scalar = typename MatricesType::value_type; - int maxMatSize; MatricesType As; - IndexViewType numRows; - IndexViewType numCols; - IndexViewType offsetA; TauViewType taus; TmpViewType ws; - MatricesType Qs; - IndexViewType offsetQ; - - qrFunctor(const int maxMatSize_, MatricesType As_, IndexViewType numRows_, IndexViewType numCols_, - IndexViewType offsetA_, TauViewType taus_, TmpViewType ws_, MatricesType Qs_, IndexViewType offsetQ_) - : maxMatSize(maxMatSize_), - As(As_), - numRows(numRows_), - numCols(numCols_), - offsetA(offsetA_), - taus(taus_), - ws(ws_), - Qs(Qs_), - offsetQ(offsetQ_) {} + MatricesType Qs, Bs; + ErrorViewType global_error; + + qrFunctor(MatricesType As_, TauViewType taus_, TmpViewType ws_, MatricesType Qs_, MatricesType Bs_, + ErrorViewType global_error_) + : As(As_), taus(taus_), ws(ws_), Qs(Qs_), Bs(Bs_), global_error(global_error_) {} KOKKOS_FUNCTION void operator()(const int matIdx) const { - Kokkos::View> A(&As(offsetA(matIdx)), numRows(matIdx), - numCols(matIdx)); - Kokkos::View> tau(&taus(matIdx * maxMatSize), - Kokkos::min(numRows(matIdx), numCols(matIdx))); - Kokkos::View> w(&ws(matIdx * maxMatSize), - Kokkos::min(numRows(matIdx), numCols(matIdx))); - Kokkos::View> Q(&Qs(offsetQ(matIdx)), numRows(matIdx), - numRows(matIdx)); + auto A = Kokkos::subview(As, matIdx, Kokkos::ALL, Kokkos::ALL); + auto tau = Kokkos::subview(taus, matIdx, Kokkos::ALL); + auto w = Kokkos::subview(ws, matIdx, Kokkos::ALL); + auto Q = Kokkos::subview(Qs, matIdx, Kokkos::ALL, Kokkos::ALL); + auto B = Kokkos::subview(Bs, matIdx, Kokkos::ALL, Kokkos::ALL); + + const Scalar SC_one = Kokkos::ArithTraits::one(); + const Scalar tol = Kokkos::ArithTraits::eps() * 10; + + int error = 0; KokkosBatched::SerialQR::invoke(A, tau, w); // Store identity in Q - for (int idx = 0; idx < numRows(matIdx); ++idx) { - Q(matIdx, matIdx) = Kokkos::ArithTraits::one(); + for (int diagIdx = 0; diagIdx < Q.extent_int(0); ++diagIdx) { + Q(diagIdx, diagIdx) = SC_one; } // Call ApplyQ on Q + for (int idx = 0; idx < w.extent_int(0); ++idx) { + w(idx) = 0.0; + } KokkosBatched::SerialApplyQ::invoke(A, tau, Q, w); + + // Now apply Q' to Q + for (int idx = 0; idx < w.extent_int(0); ++idx) { + w(idx) = 0.0; + } + KokkosBatched::SerialApplyQ::invoke(A, tau, Q, w); + + // At this point Q stores Q'Q + // which should be the identity matrix + for (int rowIdx = 0; rowIdx < Q.extent_int(0); ++rowIdx) { + for (int colIdx = 0; colIdx < Q.extent_int(1); ++colIdx) { + if (rowIdx == colIdx) { + if (Kokkos::abs(Q(rowIdx, colIdx) - SC_one) > tol) { + error += 1; + Kokkos::printf("matIdx=%d, Q(%d, %d)=%e instead of 1.0.\n", matIdx, rowIdx, colIdx, Q(rowIdx, colIdx)); + } + } else { + if (Kokkos::abs(Q(rowIdx, colIdx)) > tol) { + error += 1; + Kokkos::printf("matIdx=%d, Q(%d, %d)=%e instead of 0.0.\n", matIdx, rowIdx, colIdx, Q(rowIdx, colIdx)); + } + } + } + } + Kokkos::atomic_add(&global_error(), error); + + // Apply Q' to B which holds a copy of the orginal A + // Afterwards B should hold a copy of R and be zero below its diagonal + KokkosBatched::SerialApplyQ::invoke(A, tau, B, w); + for (int rowIdx = 0; rowIdx < B.extent_int(0); ++rowIdx) { + for (int colIdx = 0; colIdx < B.extent_int(1); ++colIdx) { + if (rowIdx <= colIdx) { + if (Kokkos::abs(B(rowIdx, colIdx) - A(rowIdx, colIdx)) > tol * Kokkos::abs(A(rowIdx, colIdx))) { + error += 1; + Kokkos::printf("B stores R\nmatIdx=%d, B(%d, %d)=%e instead of %e.\n", matIdx, rowIdx, colIdx, + B(rowIdx, colIdx), A(rowIdx, colIdx)); + } + } else { + if (Kokkos::abs(B(rowIdx, colIdx)) > 1000 * tol) { + error += 1; + Kokkos::printf("B stores R\nmatIdx=%d, B(%d, %d)=%e instead of 0.0.\n", matIdx, rowIdx, colIdx, + B(rowIdx, colIdx)); + } + } + } + } + Kokkos::atomic_add(&global_error(), error); } }; @@ -159,7 +199,7 @@ void test_QR_square() { Kokkos::parallel_for( "serialFormQ", 1, KOKKOS_LAMBDA(int) { KokkosBatched::SerialQR_FormQ_Internal::invoke(m, n, A.data(), A.stride(0), A.stride(1), t.data(), t.stride(0), - Q.data(), Q.stride(0), Q.stride(1), w.data()); + Q.data(), Q.stride(0), Q.stride(1), w.data(), w.stride(0)); }); typename MatrixViewType::HostMirror Q_h = Kokkos::create_mirror_view(Q); Kokkos::deep_copy(Q_h, Q); @@ -247,8 +287,8 @@ void test_QR_rectangular() { Test::EXPECT_NEAR_KK(A_h(2, 0), static_cast(0.), tol); Test::EXPECT_NEAR_KK_REL(A_h(2, 1), static_cast(1. / 3.), tol); - Test::EXPECT_NEAR_KK_REL(tau(0), 5. / 8., tol); - Test::EXPECT_NEAR_KK_REL(tau(1), 10. / 18., tol); + Test::EXPECT_NEAR_KK_REL(tau(0), static_cast(5. / 8.), tol); + Test::EXPECT_NEAR_KK_REL(tau(1), static_cast(10. / 18.), tol); Kokkos::parallel_for( "serialApplyQ", 1, KOKKOS_LAMBDA(int) { @@ -267,7 +307,7 @@ void test_QR_rectangular() { Kokkos::parallel_for( "serialFormQ", 1, KOKKOS_LAMBDA(int) { KokkosBatched::SerialQR_FormQ_Internal::invoke(m, n, A.data(), A.stride(0), A.stride(1), t.data(), t.stride(0), - Q.data(), Q.stride(0), Q.stride(1), w.data()); + Q.data(), Q.stride(0), Q.stride(1), w.data(), w.stride(0)); }); typename MatrixViewType::HostMirror Q_h = Kokkos::create_mirror_view(Q); Kokkos::deep_copy(Q_h, Q); @@ -310,56 +350,80 @@ void test_QR_batch() { using ExecutionSpace = typename Device::execution_space; - constexpr int numMat = 314; - constexpr int maxMatSize = 100; - Kokkos::View numRows("rows in matrix", numMat); - Kokkos::View numCols("cols in matrix", numMat); - Kokkos::View offsetA("matrix offset", numMat + 1); - Kokkos::View offsetQ("matrix offset", numMat + 1); - Kokkos::View tau("tau", maxMatSize * numMat); - Kokkos::View tmp("work buffer", maxMatSize * numMat); - - typename Kokkos::View::HostMirror numRows_h = Kokkos::create_mirror_view(numRows); - typename Kokkos::View::HostMirror numCols_h = Kokkos::create_mirror_view(numCols); - typename Kokkos::View::HostMirror offsetA_h = Kokkos::create_mirror_view(offsetA); - typename Kokkos::View::HostMirror offsetQ_h = Kokkos::create_mirror_view(offsetQ); - - std::mt19937 gen; - gen.seed(27182818); - std::uniform_int_distribution distrib(1, maxMatSize); - - offsetA_h(0) = 0; - offsetQ_h(0) = 0; - int a = 0, b = 0; - for (int matIdx = 0; matIdx < numMat; ++matIdx) { - a = distrib(gen); - b = distrib(gen); - - numRows_h(matIdx) = Kokkos::max(a, b); - numCols_h(matIdx) = Kokkos::min(a, b); - offsetA_h(matIdx + 1) = offsetA_h(matIdx) + a * b; - offsetQ_h(matIdx + 1) = offsetQ_h(matIdx) + numRows_h(matIdx) * numRows_h(matIdx); - } + { // Square matrix case + constexpr int numMat = 314; // 314 + constexpr int maxMatSize = 36; // 36 + Kokkos::View tau("tau", numMat, maxMatSize); + Kokkos::View tmp("work buffer", numMat, maxMatSize); + Kokkos::View As("A matrices", numMat, maxMatSize, maxMatSize); + Kokkos::View Bs("B matrices", numMat, maxMatSize, maxMatSize); + Kokkos::View Qs("Q matrices", numMat, maxMatSize, maxMatSize); + Kokkos::View global_error("global number of error"); + + Kokkos::Random_XorShift64_Pool rand_pool(2718); + constexpr double max_val = 1000; + { + Scalar randStart, randEnd; + Test::getRandomBounds(max_val, randStart, randEnd); + Kokkos::fill_random(ExecutionSpace{}, As, rand_pool, randStart, randEnd); + } + Kokkos::deep_copy(Bs, As); + + qrFunctor myFunc(As, tau, tmp, Qs, Bs, global_error); + Kokkos::parallel_for("KokkosBatched::test_QR_batch", Kokkos::RangePolicy(0, numMat), myFunc); - Kokkos::deep_copy(numRows, numRows_h); - Kokkos::deep_copy(numCols, numCols_h); - Kokkos::deep_copy(offsetA, offsetA_h); - Kokkos::deep_copy(offsetQ, offsetQ_h); - - const int numVals = offsetA_h(numMat); - Kokkos::View mats("matrices", numVals); - Kokkos::View Qs("Q matrices", offsetQ_h(numMat)); - - Kokkos::Random_XorShift64_Pool rand_pool(2718); - constexpr double max_val = 1000; - { - Scalar randStart, randEnd; - Test::getRandomBounds(max_val, randStart, randEnd); - Kokkos::fill_random(ExecutionSpace{}, mats, rand_pool, randStart, randEnd); + typename Kokkos::View::HostMirror global_error_h = Kokkos::create_mirror_view(global_error); + Kokkos::deep_copy(global_error_h, global_error); + EXPECT_EQ(global_error_h(), 0); } - qrFunctor myFunc(maxMatSize, mats, numRows, numCols, offsetA, tau, tmp, Qs, offsetQ); - Kokkos::parallel_for("KokkosBatched::test_QR_batch", Kokkos::RangePolicy(0, numMat), myFunc); + { // Rectangular matrix case + constexpr int numMat = 2; // 314 + constexpr int numRows = 3; // 42 + constexpr int numCols = 2; // 36 + Kokkos::View tau("tau", numMat, numCols); + Kokkos::View tmp("work buffer", numMat, numCols); + Kokkos::View As("A matrices", numMat, numRows, numCols); + Kokkos::View Bs("B matrices", numMat, numRows, numCols); + Kokkos::View Qs("Q matrices", numMat, numRows, numRows); + Kokkos::View global_error("global number of error"); + + Kokkos::Random_XorShift64_Pool rand_pool(2718); + constexpr double max_val = 1000; + { + Scalar randStart, randEnd; + Test::getRandomBounds(max_val, randStart, randEnd); + Kokkos::fill_random(ExecutionSpace{}, As, rand_pool, randStart, randEnd); + } + + { + typename Kokkos::View::HostMirror As_h = Kokkos::create_mirror_view(As); + Kokkos::deep_copy(As_h, As); + As_h(0, 0, 0) = 3.0; + As_h(0, 0, 1) = 5.0; + As_h(0, 1, 0) = 4.0; + As_h(0, 1, 1) = 0.0; + As_h(0, 2, 0) = 0.0; + As_h(0, 2, 1) = -3.0; + + As_h(1, 0, 0) = 3.0; + As_h(1, 0, 1) = 5.0; + As_h(1, 1, 0) = 4.0; + As_h(1, 1, 1) = 0.0; + As_h(1, 2, 0) = 0.0; + As_h(1, 2, 1) = -3.0; + + Kokkos::deep_copy(As, As_h); + } + Kokkos::deep_copy(Bs, As); + + qrFunctor myFunc(As, tau, tmp, Qs, Bs, global_error); + Kokkos::parallel_for("KokkosBatched::test_QR_batch", Kokkos::RangePolicy(0, numMat), myFunc); + + typename Kokkos::View::HostMirror global_error_h = Kokkos::create_mirror_view(global_error); + Kokkos::deep_copy(global_error_h, global_error); + EXPECT_EQ(global_error_h(), 0); + } } #if defined(KOKKOSKERNELS_INST_FLOAT)