Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

[SYCL][Matrix][E2E] Organize matrix tests #16563

Merged
merged 14 commits into from
Jan 23, 2025
Merged
Original file line number Diff line number Diff line change
Expand Up @@ -38,16 +38,17 @@ template <size_t TM, size_t TN, size_t TK> class MatMul;

template <
#if !defined(ARG_DIM) && !defined(RUNTIME_DIM)
size_t rowsA, size_t colsA, size_t rowsB, size_t colsB,
size_t rowsA, size_t colsA, size_t rowsB, size_t colsB,
#endif // ARG_DIM, RUNTIME_DIM
size_t vnniFactor, typename TOperand, typename TResult, size_t TM,
size_t TN, size_t TK, size_t MCache1, size_t NCache1, size_t KCache1,
size_t MCache2, size_t NCache2, size_t KCache2>
size_t vnniFactor, typename TOperand, typename TResult, size_t TM,
size_t TN, size_t TK, size_t MCache1, size_t NCache1, size_t KCache1,
size_t MCache2, size_t NCache2, size_t KCache2>
double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i
#if defined(ARG_DIM) || defined(RUNTIME_DIM)
, size_t rowsA, size_t colsA, size_t rowsB, size_t colsB
,
size_t rowsA, size_t colsA, size_t rowsB, size_t colsB
#endif // ARG_DIM, RUNTIME_DIM
) {
) {

size_t sgSize = get_sg_size<MatMul<TM, TN, TK>>(q);
range<2> global{rowsA / MCache1, (colsB / NCache1) * sgSize};
Expand Down Expand Up @@ -118,12 +119,12 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i
// pm1B and pn1B are used to identify the distribution of subgroups
// along the workgroup prefetch for B matrix. For A matrix, sgId is
// enough.
size_t pm1B = sgId / 16; // prefetch m1 (sgId/16)
size_t pn1B = sgId & 0xF; // prefetch n1 (sgId%16)
#else // VNNI
size_t pm1B = sgId / 16; // prefetch m1 (sgId/16)
size_t pn1B = sgId & 0xF; // prefetch n1 (sgId%16)
#else // VNNI
size_t pm1B = sgId / 8; // prefetch m1 (sgId/8)
size_t pn1B = sgId & 0x7; // prefetch n1 (sgId%8)
#endif // VNNI
#endif // VNNI
constexpr size_t prefDistance = 3;
for (int p = 0; p < prefDistance; p++)
joint_matrix_prefetch<prefRow, prefCol>(
Expand Down Expand Up @@ -306,8 +307,8 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i
pm1B * prefRow) *
(colsB)*vnniFactor +
(n2 * NCache2 * vnniFactor + pn1B * prefCol);
if ((prefetch_offsetB + (prefRow * colsB * vnniFactor) +
prefCol) < (rowsB * colsB))
if ((prefetch_offsetB + (prefRow * colsB * vnniFactor) + prefCol) <
(rowsB * colsB))
joint_matrix_prefetch<prefRow, prefCol>(
sg, B + prefetch_offsetB, colsB * vnniFactor,
layout::row_major,
Expand Down Expand Up @@ -395,18 +396,17 @@ void test(size_t matrix_size_input) {
// run testIterations time, aggregate and calculate average run time
double totalDuration = 0;
for (unsigned int i = 0; i < testIterations; i++) {
double duration =
joint_matmul<
double duration = joint_matmul<
#if !defined(ARG_DIM) && !defined(RUNTIME_DIM)
matrix_size, matrix_size, matrix_size, matrix_size,
matrix_size, matrix_size, matrix_size, matrix_size,
#endif // ARG_DIM, RUNTIME_DIM
vnniFactor, T, TResult, TM, TN, TK, MCache1, NCache1,
KCache1, MCache2, NCache2, KCache2>
(A, B, C, q, i
vnniFactor, T, TResult, TM, TN, TK, MCache1, NCache1, KCache1, MCache2,
NCache2, KCache2>(A, B, C, q, i
#if defined(ARG_DIM) || defined(RUNTIME_DIM)
, matrix_size, matrix_size, matrix_size, matrix_size
,
matrix_size, matrix_size, matrix_size, matrix_size
#endif // ARG_DIM, RUNTIME_DIM
);
);

if (i >= recordThresh) {
totalDuration += duration;
Expand All @@ -431,19 +431,19 @@ void test(size_t matrix_size_input) {

int main(
#ifdef RUNTIME_DIM
int argc, char *argv[]
#endif //RUNTIME_DIM
) {
int argc, char *argv[]
#endif // RUNTIME_DIM
) {

size_t matrix_size = -1;
size_t matrix_size = -1;
#ifdef RUNTIME_DIM
if (argc == 2) {
matrix_size = std::stoul(argv[1]);
} else {
std::cerr << "Usage: ./program matrix_size\n";
return 1; // Error if no argument
}
#endif //RUNTIME_DIM
#endif // RUNTIME_DIM

queue q;
std::vector<combination> combinations =
Expand Down
4 changes: 2 additions & 2 deletions sycl/test-e2e/Matrix/SG32/element_wise_abc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

#include "../common.hpp"
#include "common.hpp"

#define SG_SZ 32

#include "../element_wise_abc_impl.hpp"
#include "element_wise_abc_impl.hpp"
4 changes: 2 additions & 2 deletions sycl/test-e2e/Matrix/SG32/element_wise_all_ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

#include "../common.hpp"
#include "common.hpp"

#define SG_SZ 32

#include "../element_wise_all_ops_impl.hpp"
#include "element_wise_all_ops_impl.hpp"
4 changes: 2 additions & 2 deletions sycl/test-e2e/Matrix/SG32/element_wise_all_ops_half.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

#include "../common.hpp"
#include "common.hpp"

#define SG_SZ 32

#include "../element_wise_all_ops_half_impl.hpp"
#include "element_wise_all_ops_half_impl.hpp"
4 changes: 2 additions & 2 deletions sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

#include "../common.hpp"
#include "common.hpp"

#define SG_SZ 32

#include "../element_wise_all_ops_int8_impl.hpp"
#include "element_wise_all_ops_int8_impl.hpp"
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@

// This test stores the matrix B that is VNNIed (packed).

#include "../common.hpp"
#include "common.hpp"

#define SG_SZ 32

#include "../element_wise_all_ops_int8_packed_impl.hpp"
#include "element_wise_all_ops_int8_packed_impl.hpp"
4 changes: 2 additions & 2 deletions sycl/test-e2e/Matrix/SG32/element_wise_all_ops_tf32.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

#include "../common.hpp"
#include "common.hpp"

#define SG_SZ 32
constexpr size_t TN = 16;

#include "../element_wise_all_ops_tf32_impl.hpp"
#include "element_wise_all_ops_tf32_impl.hpp"
4 changes: 2 additions & 2 deletions sycl/test-e2e/Matrix/SG32/element_wise_all_sizes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

#include "../common.hpp"
#include "common.hpp"

#define SG_SZ 32

#include "../element_wise_all_sizes_impl.hpp"
#include "element_wise_all_sizes_impl.hpp"
4 changes: 2 additions & 2 deletions sycl/test-e2e/Matrix/SG32/element_wise_ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

#include "../common.hpp"
#include "common.hpp"

#define SG_SZ 32

#include "../element_wise_ops_impl.hpp"
#include "element_wise_ops_impl.hpp"
4 changes: 2 additions & 2 deletions sycl/test-e2e/Matrix/SG32/get_coordinate_ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

#include "../common.hpp"
#include "common.hpp"

#define SG_SZ 32

#include "../get_coordinate_ops_impl.hpp"
#include "get_coordinate_ops_impl.hpp"
4 changes: 2 additions & 2 deletions sycl/test-e2e/Matrix/SG32/joint_matrix_all_sizes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

#include "../common.hpp"
#include "common.hpp"

#define SG_SZ 32

#include "../joint_matrix_all_sizes_impl.hpp"
#include "joint_matrix_all_sizes_impl.hpp"
4 changes: 2 additions & 2 deletions sycl/test-e2e/Matrix/SG32/joint_matrix_annotated_ptr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@
// XFAIL: gpu
// XFAIL-TRACKER: GSD-4181

#include "../common.hpp"
#include "common.hpp"

#define SG_SZ 32
constexpr size_t TN = 16;

#include "../joint_matrix_annotated_ptr_impl.hpp"
#include "joint_matrix_annotated_ptr_impl.hpp"
4 changes: 2 additions & 2 deletions sycl/test-e2e/Matrix/SG32/joint_matrix_apply_bf16.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

#include "../common.hpp"
#include "common.hpp"

#define SG_SZ 32

#include "../joint_matrix_apply_bf16_impl.hpp"
#include "joint_matrix_apply_bf16_impl.hpp"
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@
// RUN: %{build} %fp-model-precise -o %t.out
// RUN: %{run} %t.out

#include "../common.hpp"
#include "common.hpp"

#define SG_SZ 32

#include "../joint_matrix_apply_two_matrices_impl.hpp"
#include "joint_matrix_apply_two_matrices_impl.hpp"
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@

// -ffp-model=precise is added to not depend on compiler defaults.

#include "../common.hpp"
#include "common.hpp"

#define SG_SZ 32

#include "../joint_matrix_bf16_fill_k_cache_impl.hpp"
#include "joint_matrix_bf16_fill_k_cache_impl.hpp"
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

// -ffp-model=precise is added to not depend on compiler defaults.

#include "../common.hpp"
#include "common.hpp"
#define SG_SZ 32

#include "../joint_matrix_bf16_fill_k_cache_impl.hpp"
#include "joint_matrix_bf16_fill_k_cache_impl.hpp"
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@

// -ffp-model=precise is added to not depend on compiler defaults.

#include "../common.hpp"
#include "common.hpp"

#define SG_SZ 32

#include "../joint_matrix_bf16_fill_k_cache_impl.hpp"
#include "joint_matrix_bf16_fill_k_cache_impl.hpp"
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@
// since IGC doesn't support some variants of IR for Joint Matrix currently
// -ffp-model=precise is added to not depend on compiler defaults.

#include "../common.hpp"
#include "common.hpp"

#define SG_SZ 32

#include "../joint_matrix_bf16_fill_k_cache_impl.hpp"
#include "joint_matrix_bf16_fill_k_cache_impl.hpp"
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@
// since IGC doesn't support some variants of IR for Joint Matrix currently
// -ffp-model=precise is added to not depend on compiler defaults.

#include "../common.hpp"
#include "common.hpp"

#define SG_SZ 32

#include "../joint_matrix_bf16_fill_k_cache_impl.hpp"
#include "joint_matrix_bf16_fill_k_cache_impl.hpp"
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

#include "../common.hpp"
#include "common.hpp"

#define SG_SZ 32

#include "../joint_matrix_bf16_rowmajorB_load_store_impl.hpp"
#include "joint_matrix_bf16_rowmajorB_load_store_impl.hpp"
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

#include "../common.hpp"
#include "common.hpp"

#define SG_SZ 32

#include "../joint_matrix_bf16_rowmajorB_pair_load_store_impl.hpp"
#include "joint_matrix_bf16_rowmajorB_pair_load_store_impl.hpp"
4 changes: 2 additions & 2 deletions sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

#include "../common.hpp"
#include "common.hpp"

#define SG_SZ 32

#include "../joint_matrix_bfloat16_impl.hpp"
#include "joint_matrix_bfloat16_impl.hpp"
4 changes: 2 additions & 2 deletions sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_array.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

#include "../common.hpp"
#include "common.hpp"

#define SG_SZ 32

#include "../joint_matrix_bfloat16_array_impl.hpp"
#include "joint_matrix_bfloat16_array_impl.hpp"
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,6 @@
// XFAIL: gpu
// XFAIL-TRACKER: GSD-5768

#include "../common.hpp"
#include "common.hpp"
#define SG_SZ 32
#include "../joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp"
#include "joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp"
4 changes: 2 additions & 2 deletions sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_packedB.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
// XFAIL: gpu
// XFAIL-TRACKER: GSD-4181

#include "../common.hpp"
#include "common.hpp"

#define SG_SZ 32
#include "../joint_matrix_bfloat16_packedB_impl.hpp"
#include "joint_matrix_bfloat16_packedB_impl.hpp"
4 changes: 2 additions & 2 deletions sycl/test-e2e/Matrix/SG32/joint_matrix_colA_rowB_colC.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,6 @@
// XFAIL: gpu && run-mode
// XFAIL-TRACKER: GSD-5768

#include "../common.hpp"
#include "common.hpp"
#define SG_SZ 32
#include "../joint_matrix_colA_rowB_colC_impl.hpp"
#include "joint_matrix_colA_rowB_colC_impl.hpp"
4 changes: 2 additions & 2 deletions sycl/test-e2e/Matrix/SG32/joint_matrix_down_convert.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

#include "../common.hpp"
#include "common.hpp"

#define SG_SZ 32

#include "../joint_matrix_down_convert_impl.hpp"
#include "joint_matrix_down_convert_impl.hpp"
4 changes: 2 additions & 2 deletions sycl/test-e2e/Matrix/SG32/joint_matrix_half.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

#include "../common.hpp"
#include "common.hpp"

#define SG_SZ 32

#include "../joint_matrix_half_impl.hpp"
#include "joint_matrix_half_impl.hpp"
Loading
Loading