From 12041681bd8b3f008fc05c6eb2638238e6bd093f Mon Sep 17 00:00:00 2001
From: Nils Vu <owls@nilsvu.de>
Date: Wed, 16 Feb 2022 11:30:35 +0100
Subject: [PATCH 1/2] Use Blaze matrix spacing in DgSubcell BLAS calls

Blaze adds padding to the matrix data layout to increase performance
(currently disabled). All other BLAS calls in the repo support padding,
but these functions were added after padding was disabled, so they
don't support padding yet.
---
 src/Evolution/DgSubcell/Matrices.cpp             | 15 ++++++++-------
 src/Evolution/DgSubcell/Reconstruction.cpp       | 10 +++++-----
 tests/Unit/Evolution/DgSubcell/Test_Matrices.cpp |  4 ++--
 3 files changed, 15 insertions(+), 14 deletions(-)
diff --git a/src/Evolution/DgSubcell/Matrices.cpp b/src/Evolution/DgSubcell/Matrices.cpp
index 296141753971..e9c41879bf46 100644
--- a/src/Evolution/DgSubcell/Matrices.cpp
+++ b/src/Evolution/DgSubcell/Matrices.cpp
@@ -309,13 +309,13 @@ Matrix reconstruction_matrix_cache_impl_helper(
   // We use rhs_recons_matrix here as a temp buffer, we will fill it later.
   Matrix rhs_recons_matrix(num_pts + 1, num_subcells);
   dgemm_<true>('T', 'N', proj_matrix.columns(), proj_matrix.columns(),
-               proj_matrix.rows(), 2.0, proj_matrix.data(), proj_matrix.rows(),
-               proj_matrix.data(), proj_matrix.rows(), 0.0,
-               rhs_recons_matrix.data(), proj_matrix.columns());
+               proj_matrix.rows(), 2.0, proj_matrix.data(),
+               proj_matrix.spacing(), proj_matrix.data(), proj_matrix.spacing(),
+               0.0, rhs_recons_matrix.data(), rhs_recons_matrix.spacing());
 
   for (size_t l = 0; l < num_pts; ++l) {
     for (size_t j = 0; j < num_pts; ++j) {
-      lhs_recons_matrix(l, j) = *(rhs_recons_matrix.data() + (j + l * num_pts));
+      lhs_recons_matrix(l, j) = rhs_recons_matrix(j, l);
     }
   }
 
@@ -363,9 +363,10 @@ Matrix reconstruction_matrix_cache_impl_helper(
   // than Blaze.
   dgemm_<true>('N', 'N', inv_lhs_recons_matrix.rows(),
                rhs_recons_matrix.columns(), inv_lhs_recons_matrix.columns(),
-               1.0, inv_lhs_recons_matrix.data(), inv_lhs_recons_matrix.rows(),
-               rhs_recons_matrix.data(), inv_lhs_recons_matrix.columns(), 0.0,
-               full_recons_matrix.data(), inv_lhs_recons_matrix.rows());
+               1.0, inv_lhs_recons_matrix.data(),
+               inv_lhs_recons_matrix.spacing(), rhs_recons_matrix.data(),
+               rhs_recons_matrix.spacing(), 0.0, full_recons_matrix.data(),
+               full_recons_matrix.spacing());
   // exclude bottom row for Lagrange multiplier.
   Matrix reduced_recons_matrix(num_pts, num_subcells);
   for (size_t i = 0; i < num_pts; ++i) {
diff --git a/src/Evolution/DgSubcell/Reconstruction.cpp b/src/Evolution/DgSubcell/Reconstruction.cpp
index ba306e63838b..147ed343684b 100644
--- a/src/Evolution/DgSubcell/Reconstruction.cpp
+++ b/src/Evolution/DgSubcell/Reconstruction.cpp
@@ -32,11 +32,11 @@ void reconstruct_impl(
   if (reconstruction_method == ReconstructionMethod::AllDimsAtOnce) {
     const Matrix& recons_matrix =
         reconstruction_matrix(dg_mesh, subcell_extents);
-    dgemm_<true>('N', 'N', recons_matrix.rows(), number_of_components,
-                 recons_matrix.columns(), 1.0, recons_matrix.data(),
-                 recons_matrix.rows(), subcell_u_times_projected_det_jac.data(),
-                 recons_matrix.columns(), 0.0, dg_u.data(),
-                 recons_matrix.rows());
+    dgemm_<true>(
+        'N', 'N', recons_matrix.rows(), number_of_components,
+        recons_matrix.columns(), 1.0, recons_matrix.data(),
+        recons_matrix.spacing(), subcell_u_times_projected_det_jac.data(),
+        recons_matrix.columns(), 0.0, dg_u.data(), recons_matrix.rows());
   } else {
     ASSERT(reconstruction_method == ReconstructionMethod::DimByDim,
            "reconstruction_method must be either DimByDim or AllDimsAtOnce");
diff --git a/tests/Unit/Evolution/DgSubcell/Test_Matrices.cpp b/tests/Unit/Evolution/DgSubcell/Test_Matrices.cpp
index 01e37b3d31d6..3b3c275a3f85 100644
--- a/tests/Unit/Evolution/DgSubcell/Test_Matrices.cpp
+++ b/tests/Unit/Evolution/DgSubcell/Test_Matrices.cpp
@@ -221,8 +221,8 @@ void reconstruction_matrix(const double eps) {
 
     DataVector reconstructed_nodal_coeffs(num_pts);
     dgemv_('N', single_recons.rows(), single_recons.columns(), 1.0,
-           single_recons.data(), single_recons.rows(), subcell_values.data(), 1,
-           0.0, reconstructed_nodal_coeffs.data(), 1);
+           single_recons.data(), single_recons.spacing(), subcell_values.data(),
+           1, 0.0, reconstructed_nodal_coeffs.data(), 1);
 
     CHECK_ITERABLE_CUSTOM_APPROX(expected_nodal_coeffs,
                                  reconstructed_nodal_coeffs, local_approx);

From ff40b91b9232b94a243e67478d66ace2d1a71548 Mon Sep 17 00:00:00 2001
From: Nils Vu <owls@nilsvu.de>
Date: Thu, 17 Feb 2022 11:55:03 +0100
Subject: [PATCH 2/2] Blaze: add comments on padding

Blaze warns that disabling padding can decrease performance considerably:
https://bitbucket.org/blaze-lib/blaze/src/c4d9e85414370e880e5e79c86e3c8d4d38dcde7a/blaze/config/Optimizations.h#lines-52
To support padding, explicit calls to LAPACK functions need to
pass `.spacing()` instead of `.rows()/.columns()` to the `LDA`,
`LDB`, etc. parameters.
---
 cmake/SetupBlaze.cmake                                    | 7 +++++++
 tests/Unit/NumericalAlgorithms/Spectral/Test_Spectral.cpp | 2 ++
 2 files changed, 9 insertions(+)

diff --git a/cmake/SetupBlaze.cmake b/cmake/SetupBlaze.cmake
index c8f99ac52ab9..2b583cce9d92 100644
--- a/cmake/SetupBlaze.cmake
+++ b/cmake/SetupBlaze.cmake
@@ -130,6 +130,13 @@ target_compile_definitions(Blaze
   #   by the Blaze CMake configuration for the machine we are running on. We
   #   could override it here explicitly to tune performance.
   # BLAZE_CACHE_SIZE
+  # - Disable padding for dynamic matrices.
+  #   Blaze warns that this may decrease performance:
+  #   https://bitbucket.org/blaze-lib/blaze/src/c4d9e85414370e880e5e79c86e3c8d4d38dcde7a/blaze/config/Optimizations.h#lines-52
+  #   We haven't tested this much, so we may want to try enabling padding again.
+  #   To support padding, explicit calls to LAPACK functions need to pass
+  #   `.spacing()` instead of `.rows()/.columns()` to the `LDA`, `LDB`, etc.
+  #   parameters (see `[matrix_spacing]` in `Test_Spectral.cpp`).
   BLAZE_USE_PADDING=0
   # - Always enable non-temporal stores for cache optimization of large data
   #   structures: https://bitbucket.org/blaze-lib/blaze/wiki/Configuration%20Files#!streaming-non-temporal-stores
diff --git a/tests/Unit/NumericalAlgorithms/Spectral/Test_Spectral.cpp b/tests/Unit/NumericalAlgorithms/Spectral/Test_Spectral.cpp
index 9cf6b50c7a39..7f30d09ee982 100644
--- a/tests/Unit/NumericalAlgorithms/Spectral/Test_Spectral.cpp
+++ b/tests/Unit/NumericalAlgorithms/Spectral/Test_Spectral.cpp
@@ -62,8 +62,10 @@ void test_exact_differentiation_impl(const Function& max_poly_deg) {
           Spectral::differentiation_matrix<BasisType, QuadratureType>(n);
       const auto u = unit_polynomial(p, collocation_pts);
       DataVector numeric_derivative{n};
+      // [matrix_spacing]
       dgemv_('N', n, n, 1., diff_matrix.data(), diff_matrix.spacing(), u.data(),
              1, 0.0, numeric_derivative.data(), 1);
+      // [matrix_spacing]
       const auto analytic_derivative =
           unit_polynomial_derivative(p, collocation_pts);
       if (BasisType == Spectral::Basis::FiniteDifference and n > 20) {