Skip to content

Commit

Permalink
Batched - Dense Serial QR: fixing issue with work array
Browse files Browse the repository at this point in the history
We did not pass the stride of the work array to internal routines
and we are not enforcing a contiguous memory allocation either so
when using subviews to pass the work array, we run into troubles.

Signed-off-by: Luc Berger-Vergiat <[email protected]>
  • Loading branch information
lucbv committed Dec 4, 2024
1 parent 66c5cc9 commit f173adf
Show file tree
Hide file tree
Showing 9 changed files with 62 additions and 79 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@ struct SerialApplyLeftHouseholderInternal {
/* */ ValueType* u2, const int u2s,
/* */ ValueType* a1t, const int a1ts,
/* */ ValueType* A2, const int as0, const int as1,
/* */ ValueType* w1t) {
typedef ValueType value_type;
/* */ ValueType* w1t, const int ws0) {
using value_type = ValueType;

/// u2 m x 1
/// a1t 1 x n
Expand All @@ -54,15 +54,15 @@ struct SerialApplyLeftHouseholderInternal {
for (int j = 0; j < n; ++j) {
value_type tmp = a1t[j * a1ts];
for (int i = 0; i < m; ++i) tmp += Kokkos::ArithTraits<value_type>::conj(u2[i * u2s]) * A2[i * as0 + j * as1];
w1t[j] = tmp * inv_tau; // /= (*tau);
w1t[j * ws0] = tmp * inv_tau; // /= (*tau);
}

// a1t -= w1t (axpy)
for (int j = 0; j < n; ++j) a1t[j * a1ts] -= w1t[j];
for (int j = 0; j < n; ++j) a1t[j * a1ts] -= w1t[j * ws0];

// A2 -= u2 w1t (ger)
for (int j = 0; j < n; ++j)
for (int i = 0; i < m; ++i) A2[i * as0 + j * as1] -= u2[i * u2s] * w1t[j];
for (int i = 0; i < m; ++i) A2[i * as0 + j * as1] -= u2[i * u2s] * w1t[j * ws0];

return 0;
}
Expand All @@ -74,8 +74,8 @@ struct SerialApplyRightHouseholderInternal {
/* */ ValueType* u2, const int u2s,
/* */ ValueType* a1, const int a1s,
/* */ ValueType* A2, const int as0, const int as1,
/* */ ValueType* w1) {
typedef ValueType value_type;
/* */ ValueType* w1, const int ws0) {
using value_type = ValueType;
/// u2 n x 1
/// a1 m x 1
/// A2 m x n
Expand All @@ -93,15 +93,15 @@ struct SerialApplyRightHouseholderInternal {
for (int i = 0; i < m; ++i) {
value_type tmp = a1[i * a1s];
for (int j = 0; j < n; ++j) tmp += A2[i * as0 + j * as1] * u2[j * u2s];
w1[i] = tmp * inv_tau; // \= (*tau);
w1[i * ws0] = tmp * inv_tau; // \= (*tau);
}

// a1 -= w1 (axpy)
for (int i = 0; i < m; ++i) a1[i * a1s] -= w1[i];
for (int i = 0; i < m; ++i) a1[i * a1s] -= w1[i * ws0];

// A2 -= w1 * u2' (ger with conjugate)
for (int j = 0; j < n; ++j)
for (int i = 0; i < m; ++i) A2[i * as0 + j * as1] -= w1[i] * Kokkos::ArithTraits<ValueType>::conj(u2[j * u2s]);
for (int i = 0; i < m; ++i) A2[i * as0 + j * as1] -= w1[i * ws0] * Kokkos::ArithTraits<ValueType>::conj(u2[j * u2s]);

return 0;
}
Expand Down
6 changes: 3 additions & 3 deletions batched/dense/impl/KokkosBatched_ApplyQ_Serial_Impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ KOKKOS_INLINE_FUNCTION int SerialApplyQ<Side::Left, Trans::NoTranspose, Algo::Ap
const AViewType &A, const tViewType &t, const BViewType &B, const wViewType &w) {
return SerialApplyQ_LeftForwardInternal::invoke(B.extent(0), B.extent(1), A.extent(1), A.data(), A.stride_0(),
A.stride_1(), t.data(), t.stride_0(), B.data(), B.stride_0(),
B.stride_1(), w.data());
B.stride_1(), w.data(), w.stride_0());
}

template <>
Expand All @@ -42,7 +42,7 @@ KOKKOS_INLINE_FUNCTION int SerialApplyQ<Side::Left, Trans::Transpose, Algo::Appl
const AViewType &A, const tViewType &t, const BViewType &B, const wViewType &w) {
return SerialApplyQ_LeftBackwardInternal::invoke(B.extent(0), B.extent(1), A.extent(1), A.data(), A.stride_0(),
A.stride_1(), t.data(), t.stride_0(), B.data(), B.stride_0(),
B.stride_1(), w.data());
B.stride_1(), w.data(), w.stride_0());
}

template <>
Expand All @@ -51,7 +51,7 @@ KOKKOS_INLINE_FUNCTION int SerialApplyQ<Side::Right, Trans::NoTranspose, Algo::A
const AViewType &A, const tViewType &t, const BViewType &B, const wViewType &w) {
return SerialApplyQ_RightForwardInternal::invoke(B.extent(0), B.extent(1), A.extent(1), A.data(), A.stride_0(),
A.stride_1(), t.data(), t.stride_0(), B.data(), B.stride_0(),
B.stride_1(), w.data());
B.stride_1(), w.data(), w.strid_0());
}

} // namespace KokkosBatched
Expand Down
13 changes: 6 additions & 7 deletions batched/dense/impl/KokkosBatched_ApplyQ_Serial_Internal.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ struct SerialApplyQ_LeftForwardInternal {
/* */ ValueType *A, const int as0, const int as1,
/* */ ValueType *t, const int ts,
/* */ ValueType *B, const int bs0, const int bs1,
/* */ ValueType *w) {
/* */ ValueType *w, const int ws) {
typedef ValueType value_type;

/// Given a matrix A that includes a series of householder vectors,
Expand Down Expand Up @@ -73,7 +73,7 @@ struct SerialApplyQ_LeftForwardInternal {
/// -----------------------------------------------------
// left apply householder to partitioned B1 and B2
SerialApplyLeftHouseholderInternal::invoke(m_A2, n, tau, A_part3x3.A21, as0, B_part3x1.A1, bs1, B_part3x1.A2, bs0,
bs1, w);
bs1, w, ws);

/// -----------------------------------------------------
A_part2x2.mergeToABR(A_part3x3);
Expand All @@ -90,7 +90,7 @@ struct SerialApplyQ_LeftBackwardInternal {
/* */ ValueType *A, const int as0, const int as1,
/* */ ValueType *t, const int ts,
/* */ ValueType *B, const int bs0, const int bs1,
/* */ ValueType *w) {
/* */ ValueType *w, const int ws) {
typedef ValueType value_type;

/// Given a matrix A that includes a series of householder vectors,
Expand Down Expand Up @@ -127,8 +127,7 @@ struct SerialApplyQ_LeftBackwardInternal {
/// -----------------------------------------------------
// left apply householder to partitioned B1 and B2
SerialApplyLeftHouseholderInternal::invoke(m_A2, n, tau, A_part3x3.A21, as0, B_part3x1.A1, bs1, B_part3x1.A2, bs0,
bs1, w);

bs1, w, ws);
/// -----------------------------------------------------
A_part2x2.mergeToATL(A_part3x3);
t_part2x1.mergeToAT(t_part3x1);
Expand All @@ -144,7 +143,7 @@ struct SerialApplyQ_RightForwardInternal {
/* */ ValueType *A, const int as0, const int as1,
/* */ ValueType *t, const int ts,
/* */ ValueType *B, const int bs0, const int bs1,
/* */ ValueType *w) {
/* */ ValueType *w, const int ws) {
typedef ValueType value_type;

/// Given a matrix A that includes a series of householder vectors,
Expand Down Expand Up @@ -181,7 +180,7 @@ struct SerialApplyQ_RightForwardInternal {
/// -----------------------------------------------------
// right apply householder to partitioned B1 and B2
SerialApplyRightHouseholderInternal::invoke(m, n_B2, tau, A_part3x3.A21, as0, B_part1x3.A1, bs0, B_part1x3.A2,
bs0, bs1, w);
bs0, bs1, ws);
/// -----------------------------------------------------
A_part2x2.mergeToATL(A_part3x3);
t_part2x1.mergeToAT(t_part3x1);
Expand Down
6 changes: 3 additions & 3 deletions batched/dense/impl/KokkosBatched_QR_FormQ_Serial_Internal.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@ struct SerialQR_FormQ_Internal {
/* */ ValueType* A, const int as0, const int as1,
/* */ ValueType* t, const int ts,
/* */ ValueType* Q, const int qs0, const int qs1,
/* */ ValueType* w, const bool is_Q_zero = false) {
typedef ValueType value_type;
/* */ ValueType* w, const int ws, const bool is_Q_zero = false) {
using value_type = ValueType;

/// Given a matrix A that includes QR factorization
/// it forms a unitary matrix Q
Expand All @@ -58,7 +58,7 @@ struct SerialQR_FormQ_Internal {
SerialSetIdentityInternal::invoke(m, m, Q, qs0, qs1);
}

return SerialApplyQ_LeftForwardInternal::invoke(m, m, k, A, as0, as1, t, ts, Q, qs0, qs1, w);
return SerialApplyQ_LeftForwardInternal::invoke(m, m, k, A, as0, as1, t, ts, Q, qs0, qs1, w, ws);
}
};

Expand Down
2 changes: 1 addition & 1 deletion batched/dense/impl/KokkosBatched_QR_Serial_Impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ template <typename AViewType, typename tViewType, typename wViewType>
KOKKOS_INLINE_FUNCTION int SerialQR<Algo::QR::Unblocked>::invoke(const AViewType &A, const tViewType &t,
const wViewType &w) {
return SerialQR_Internal::invoke(A.extent(0), A.extent(1), A.data(), A.stride_0(), A.stride_1(), t.data(),
t.stride_0(), w.data());
t.stride_0(), w.data(), w.stride_0());
}

} // namespace KokkosBatched
Expand Down
4 changes: 2 additions & 2 deletions batched/dense/impl/KokkosBatched_QR_Serial_Internal.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ struct SerialQR_Internal {
const int n, // n = NumCols(A)
/* */ ValueType *A, const int as0, const int as1,
/* */ ValueType *t, const int ts,
/* */ ValueType *w) {
/* */ ValueType *w, const int ws) {
typedef ValueType value_type;

/// Given a matrix A, it computes QR decomposition of the matrix
Expand Down Expand Up @@ -69,7 +69,7 @@ struct SerialQR_Internal {

// left apply householder to A22
SerialApplyLeftHouseholderInternal::invoke(m_A22, n_A22, tau, A_part3x3.A21, as0, A_part3x3.A12, as1,
A_part3x3.A22, as0, as1, w);
A_part3x3.A22, as0, as1, w, ws);
/// -----------------------------------------------------
A_part2x2.mergeToATL(A_part3x3);
t_part2x1.mergeToAT(t_part3x1);
Expand Down
8 changes: 4 additions & 4 deletions batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -176,11 +176,11 @@ struct SerialSVDInternal {
if (n - i > 1) {
KokkosBatched::SerialApplyLeftHouseholderInternal::invoke<value_type>(
m - i - 1, n - i - 1, &tau, &SVDIND(A, i + 1, i), As0, &SVDIND(A, i, i + 1), As1, &SVDIND(A, i + 1, i + 1),
As0, As1, work);
As0, As1, work, 1);
}
if (U) {
KokkosBatched::SerialApplyRightHouseholderInternal::invoke<value_type>(
m, m - i - 1, &tau, &SVDIND(A, i + 1, i), As0, &SVDIND(U, 0, i), Us0, &SVDIND(U, 0, i + 1), Us0, Us1, work);
m, m - i - 1, &tau, &SVDIND(A, i + 1, i), As0, &SVDIND(U, 0, i), Us0, &SVDIND(U, 0, i + 1), Us0, Us1, work, 1);
}
// Zero out A subdiag explicitly (NOTE: may not be necessary...)
for (int j = i + 1; j < m; j++) {
Expand All @@ -193,12 +193,12 @@ struct SerialSVDInternal {
if (m - i > 1) {
KokkosBatched::SerialApplyRightHouseholderInternal::invoke<value_type>(
m - i - 1, n - i - 2, &tau, &SVDIND(A, i, i + 2), As1, &SVDIND(A, i + 1, i + 1), As0,
&SVDIND(A, i + 1, i + 2), As0, As1, work);
&SVDIND(A, i + 1, i + 2), As0, As1, work, 1);
}
if (Vt) {
KokkosBatched::SerialApplyLeftHouseholderInternal::invoke<value_type>(
n - i - 2, n, &tau, &SVDIND(A, i, i + 2), As1, &SVDIND(Vt, i + 1, 0), Vts1, &SVDIND(Vt, i + 2, 0), Vts0,
Vts1, work);
Vts1, work, 1);
}
// Zero out A superdiag row explicitly
for (int j = i + 2; j < n; j++) {
Expand Down
6 changes: 3 additions & 3 deletions batched/dense/src/KokkosBatched_ApplyQ_Decl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,11 +64,11 @@ struct ApplyQ {
KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const tViewType &t,
const BViewType &B, const wViewType &w) {
int r_val = 0;
if (std::is_same<ArgMode, Mode::Serial>::value) {
if constexpr (std::is_same_v<ArgMode, Mode::Serial>) {
r_val = SerialApplyQ<ArgSide, ArgTrans, ArgAlgo>::invoke(A, t, B, w);
} else if (std::is_same<ArgMode, Mode::Team>::value) {
} else if constexpr (std::is_same_v<ArgMode, Mode::Team>) {
r_val = TeamApplyQ<MemberType, ArgSide, ArgTrans, ArgAlgo>::invoke(member, A, t, B, w);
} else if (std::is_same<ArgMode, Mode::Team>::value) {
} else if constexpr (std::is_same_v<ArgMode, Mode::TeamVector>) {
r_val = TeamVectorApplyQ<MemberType, ArgSide, ArgTrans, ArgAlgo>::invoke(member, A, t, B, w);
}
return r_val;
Expand Down
76 changes: 30 additions & 46 deletions batched/dense/unit_test/Test_Batched_SerialQR.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,29 +53,19 @@ struct qrFunctor {
int error = 0;

KokkosBatched::SerialQR<KokkosBlas::Algo::QR::Unblocked>::invoke(A, tau, w);
Kokkos::printf("A%d = [%5.2f, %5.2f]\n [%5.2f, %5.2f]\n [%5.2f, %5.2f]\n",
matIdx, A(0, 0), A(0, 1), A(1, 0), A(1, 1), A(2, 0), A(2, 1));
Kokkos::printf("tau%d = [%5.3f, %5.3f]\n", matIdx, tau(0), tau(1));

// Store identity in Q
for (int diagIdx = 0; diagIdx < Q.extent_int(0); ++diagIdx) {
Q(diagIdx, diagIdx) = SC_one;
}
Kokkos::printf("Q stores I\nQ%d = [%5.2f, %5.2f, %5.2f]\n [%5.2f, %5.2f, %5.2f]\n [%5.2f, %5.2f, %5.2f]\n",
matIdx, Q(0, 0), Q(0, 1), Q(0, 2), Q(1, 0), Q(1, 1), Q(1, 2), Q(2, 0), Q(2, 1), Q(2, 2));

// Call ApplyQ on Q
w(0) = 0.0, w(1) = 0.0;
for(int idx = 0; idx < w.extent_int(0); ++idx) { w(idx) = 0.0; }
KokkosBatched::SerialApplyQ<Side::Left, Trans::NoTranspose, Algo::ApplyQ::Unblocked>::invoke(A, tau, Q, w);
Kokkos::printf("\nQ stores Q\nQ%d = [%5.2f, %5.2f, %5.2f]\n [%5.2f, %5.2f, %5.2f]\n [%5.2f, %5.2f, %5.2f]\n",
matIdx, Q(0, 0), Q(0, 1), Q(0, 2), Q(1, 0), Q(1, 1), Q(1, 2), Q(2, 0), Q(2, 1), Q(2, 2));

// Now apply Q' to Q
w(0) = 0.0, w(1) = 0.0;
// KokkosBatched::SerialApplyQ<Side::Left, Trans::Transpose, Algo::ApplyQ::Unblocked>::invoke(A, tau, Q, w);
for(int idx = 0; idx < w.extent_int(0); ++idx) { w(idx) = 0.0; }
KokkosBatched::SerialApplyQ<Side::Left, Trans::Transpose, Algo::ApplyQ::Unblocked>::invoke(A, tau, Q, w);
Kokkos::printf("\nQ stores Q'Q=I\nQ%d = [%5.2f, %5.2f, %5.2f]\n [%5.2f, %5.2f, %5.2f]\n [%5.2f, %5.2f, %5.2f]\n",
matIdx, Q(0, 0), Q(0, 1), Q(0, 2), Q(1, 0), Q(1, 1), Q(1, 2), Q(2, 0), Q(2, 1), Q(2, 2));

// At this point Q stores Q'Q
// which should be the identity matrix
Expand Down Expand Up @@ -203,7 +193,7 @@ void test_QR_square() {
Kokkos::parallel_for(
"serialFormQ", 1, KOKKOS_LAMBDA(int) {
KokkosBatched::SerialQR_FormQ_Internal::invoke(m, n, A.data(), A.stride(0), A.stride(1), t.data(), t.stride(0),
Q.data(), Q.stride(0), Q.stride(1), w.data());
Q.data(), Q.stride(0), Q.stride(1), w.data(), w.stride(0));
});
typename MatrixViewType::HostMirror Q_h = Kokkos::create_mirror_view(Q);
Kokkos::deep_copy(Q_h, Q);
Expand Down Expand Up @@ -311,7 +301,7 @@ void test_QR_rectangular() {
Kokkos::parallel_for(
"serialFormQ", 1, KOKKOS_LAMBDA(int) {
KokkosBatched::SerialQR_FormQ_Internal::invoke(m, n, A.data(), A.stride(0), A.stride(1), t.data(), t.stride(0),
Q.data(), Q.stride(0), Q.stride(1), w.data());
Q.data(), Q.stride(0), Q.stride(1), w.data(), w.stride(0));
});
typename MatrixViewType::HostMirror Q_h = Kokkos::create_mirror_view(Q);
Kokkos::deep_copy(Q_h, Q);
Expand Down Expand Up @@ -354,32 +344,32 @@ void test_QR_batch() {

using ExecutionSpace = typename Device::execution_space;

// { // Square matrix case
// constexpr int numMat = 1; // 314
// constexpr int maxMatSize = 6; // 36
// Kokkos::View<Scalar**, ExecutionSpace> tau("tau", numMat, maxMatSize);
// Kokkos::View<Scalar**, ExecutionSpace> tmp("work buffer", numMat, maxMatSize);
// Kokkos::View<Scalar***, ExecutionSpace> As("A matrices", numMat, maxMatSize, maxMatSize);
// Kokkos::View<Scalar***, ExecutionSpace> Bs("B matrices", numMat, maxMatSize, maxMatSize);
// Kokkos::View<Scalar***, ExecutionSpace> Qs("Q matrices", numMat, maxMatSize, maxMatSize);
// Kokkos::View<int, ExecutionSpace> global_error("global number of error");

// Kokkos::Random_XorShift64_Pool<ExecutionSpace> rand_pool(2718);
// constexpr double max_val = 1000;
// {
// Scalar randStart, randEnd;
// Test::getRandomBounds(max_val, randStart, randEnd);
// Kokkos::fill_random(ExecutionSpace{}, As, rand_pool, randStart, randEnd);
// }
// Kokkos::deep_copy(Bs, As);

// qrFunctor myFunc(As, tau, tmp, Qs, Bs, global_error);
// Kokkos::parallel_for("KokkosBatched::test_QR_batch", Kokkos::RangePolicy<ExecutionSpace>(0, numMat), myFunc);

// typename Kokkos::View<int, ExecutionSpace>::HostMirror global_error_h = Kokkos::create_mirror_view(global_error);
// Kokkos::deep_copy(global_error_h, global_error);
// EXPECT_EQ(global_error_h(), 0);
// }
{ // Square matrix case
constexpr int numMat = 314; // 314
constexpr int maxMatSize = 36; // 36
Kokkos::View<Scalar**, ExecutionSpace> tau("tau", numMat, maxMatSize);
Kokkos::View<Scalar**, ExecutionSpace> tmp("work buffer", numMat, maxMatSize);
Kokkos::View<Scalar***, ExecutionSpace> As("A matrices", numMat, maxMatSize, maxMatSize);
Kokkos::View<Scalar***, ExecutionSpace> Bs("B matrices", numMat, maxMatSize, maxMatSize);
Kokkos::View<Scalar***, ExecutionSpace> Qs("Q matrices", numMat, maxMatSize, maxMatSize);
Kokkos::View<int, ExecutionSpace> global_error("global number of error");

Kokkos::Random_XorShift64_Pool<ExecutionSpace> rand_pool(2718);
constexpr double max_val = 1000;
{
Scalar randStart, randEnd;
Test::getRandomBounds(max_val, randStart, randEnd);
Kokkos::fill_random(ExecutionSpace{}, As, rand_pool, randStart, randEnd);
}
Kokkos::deep_copy(Bs, As);

qrFunctor myFunc(As, tau, tmp, Qs, Bs, global_error);
Kokkos::parallel_for("KokkosBatched::test_QR_batch", Kokkos::RangePolicy<ExecutionSpace>(0, numMat), myFunc);

typename Kokkos::View<int, ExecutionSpace>::HostMirror global_error_h = Kokkos::create_mirror_view(global_error);
Kokkos::deep_copy(global_error_h, global_error);
EXPECT_EQ(global_error_h(), 0);
}

{ // Rectangular matrix case
constexpr int numMat = 2; // 314
Expand Down Expand Up @@ -410,12 +400,6 @@ void test_QR_batch() {
As_h(1, 0, 0) = 3.0; As_h(1, 0, 1) = 5.0;
As_h(1, 1, 0) = 4.0; As_h(1, 1, 1) = 0.0;
As_h(1, 2, 0) = 0.0; As_h(1, 2, 1) = -3.0;
std::cout << "A0 = [" << As_h(0, 0, 0) << ", " << As_h(0, 0, 1) << "]\n"
<< " [" << As_h(0, 1, 0) << ", " << As_h(0, 1, 1) << "]\n"
<< " [" << As_h(0, 2, 0) << ", " << As_h(0, 2, 1) << "]\n" << std::endl;
std::cout << "A1 = [" << As_h(1, 0, 0) << ", " << As_h(1, 0, 1) << "]\n"
<< " [" << As_h(1, 1, 0) << ", " << As_h(1, 1, 1) << "]\n"
<< " [" << As_h(1, 2, 0) << ", " << As_h(1, 2, 1) << "]" << std::endl;

Kokkos::deep_copy(As, As_h);
}
Expand Down

0 comments on commit f173adf

Please sign in to comment.