From 3cc217acfae957c1baa23bf0cf5ebab5fedd33b1 Mon Sep 17 00:00:00 2001 From: RahulSudarMCW Date: Tue, 7 Jan 2025 15:33:51 +0530 Subject: [PATCH] Auto generate script for header --- scripts/generate-tests.sh | 1 - src/bf16-gemm/bf16-gemm-minmax.h | 65 +- src/bf16-gemm/bf16-gemm-minmax1.h | 33 + src/f16-f32acc-gemm/f16-f32acc-gemm-minmax.h | 32 +- .../f16-f32acc-igemm-minmax.h | 20 + src/f16-gemm/f16-gemm-minmax.h | 102 ++-- src/f16-igemm/f16-igemm-minmax.h | 58 ++ src/f32-gemm/f32-gemm-goi-minmax.h | 14 + src/f32-gemm/f32-gemm-minmax.h | 574 +++++++++--------- src/f32-gemm/f32-gemm-relu.h | 103 ++-- src/f32-igemm/f32-igemm-minmax.h | 303 +++++++++ src/f32-igemm/f32-igemm-relu.h | 63 ++ src/f32-ppmm/f32-ppmm-minmax.h | 59 ++ src/f32-qc4w-gemm/f32-qc4w-gemm-minmax.h | 111 ++++ src/f32-qc8w-gemm/f32-qc8w-gemm-minmax.h | 247 ++++++++ src/f32-qc8w-gemm/f32-qc8w-gemm-relu.h | 38 ++ .../qd8-f16-qb4w-gemm-minmax.h | 77 +++ .../qd8-f16-qc4w-gemm-minmax.h | 180 ++++++ .../qd8-f16-qc8w-gemm-minmax.h | 126 ++++ .../qd8-f32-qb4w-gemm-minmax.h | 142 +++++ .../qd8-f32-qc4w-gemm-minmax.h | 397 ++++++++++++ .../qd8-f32-qc8w-gemm-minmax.h | 404 ++++++++++++ .../qd8-f32-qc8w-igemm-minmax.h | 401 ++++++++++++ tools/generate-gemm-header.py | 344 +++++++++++ tools/generate-gemm-test.py | 9 - 25 files changed, 3465 insertions(+), 438 deletions(-) create mode 100644 src/bf16-gemm/bf16-gemm-minmax1.h create mode 100644 tools/generate-gemm-header.py diff --git a/scripts/generate-tests.sh b/scripts/generate-tests.sh index 2495cd0bbcc..cd027f9ffd7 100755 --- a/scripts/generate-tests.sh +++ b/scripts/generate-tests.sh @@ -13,7 +13,6 @@ tools/generate-gemm-test.py --ukernel f16-f32acc-gemm-minmax --output-test test/ tools/generate-gemm-test.py --ukernel f32-gemm --output-test test/f32-gemm.cc & tools/generate-gemm-test.py --ukernel f32-gemm-relu --output-test test/f32-gemm-relu.cc & tools/generate-gemm-test.py --ukernel f32-gemm-minmax --output-test test/f32-gemm-minmax.cc & -### TODO 
tools/generate-gemm-test.py --ukernel f32-gemminc-minmax --output-test test/f32-gemminc-minmax.cc & tools/generate-gemm-test.py --ukernel f32-gemm-goi-minmax --output-test test/f32-gemm-goi-minmax.cc & diff --git a/src/bf16-gemm/bf16-gemm-minmax.h b/src/bf16-gemm/bf16-gemm-minmax.h index 6653817b0a6..d6d1537736a 100644 --- a/src/bf16-gemm/bf16-gemm-minmax.h +++ b/src/bf16-gemm/bf16-gemm-minmax.h @@ -1,33 +1,38 @@ // Copyright 2023 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// Arguments are: -// XNN_GEMM_MINMAX(arch_flags, fn_name, k_block, is_pipelined, mr, nr, kr, sr, mr_packed, is_igemm, datatype, params_type, init_fn, pack_fn) - + // + // This source code is licensed under the BSD-style license found in the + // LICENSE file in the root directory of this source tree. + // Arguments are: + // XNN_GEMM(arch_flags, fn_name, k_block, is_pipelined, mr, nr, kr, sr, mr_packed, is_igemm, datatype, params_type, init_fn, pack_fn) + + #if XNN_ARCH_ARM || XNN_ARCH_ARM64 -XNN_GEMM(XNN_ARCH_ARM, xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_shland, 8, false, 1, 4, 8, 1, false, xnn_bf16_default_params, struct xnn_bf16_minmax_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_shland, 8, false, 2, 4, 8, 1, false, xnn_bf16_default_params, struct xnn_bf16_minmax_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_shland, 8, false, 3, 4, 8, 1, false, xnn_bf16_default_params, struct xnn_bf16_minmax_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_shland, 8, false, 4, 4, 8, 1, false, xnn_bf16_default_params, struct xnn_bf16_minmax_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, 
xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_shland, 8, false, 5, 4, 8, 1, false, xnn_bf16_default_params, struct xnn_bf16_minmax_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_zip, 8, false, 1, 4, 8, 1, false, xnn_bf16_default_params, struct xnn_bf16_minmax_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_zip, 8, false, 2, 4, 8, 1, false, xnn_bf16_default_params, struct xnn_bf16_minmax_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_zip, 8, false, 3, 4, 8, 1, false, xnn_bf16_default_params, struct xnn_bf16_minmax_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_zip, 8, false, 4, 4, 8, 1, false, xnn_bf16_default_params, struct xnn_bf16_minmax_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_zip, 8, false, 5, 4, 8, 1, false, xnn_bf16_default_params, struct xnn_bf16_minmax_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_bf16_gemm_minmax_ukernel_1x8c2__neonbf16_bfdot_lane_ld128, 8, false, 1, 8, 2, 1, false, xnn_bf16_default_params, struct xnn_bf16_minmax_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_bf16_gemm_minmax_ukernel_4x8c2__neonbf16_bfdot_lane_ld128, 8, false, 4, 8, 2, 1, false, xnn_bf16_default_params, struct xnn_bf16_minmax_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_bf16_gemm_minmax_ukernel_5x8c2__neonbf16_bfdot_lane_ld128, 8, false, 5, 8, 2, 1, false, xnn_bf16_default_params, struct xnn_bf16_minmax_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) 
-XNN_GEMM(XNN_ARCH_ARM, xnn_bf16_gemm_minmax_ukernel_6x8c2__neonbf16_bfdot_lane_ld128, 8, false, 6, 8, 2, 1, false, xnn_bf16_default_params, struct xnn_bf16_minmax_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfdot, 8, false, 1, 4, 8, 1, false, xnn_bf16_default_params, struct xnn_bf16_minmax_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfdot, 8, false, 2, 4, 8, 1, false, xnn_bf16_default_params, struct xnn_bf16_minmax_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfdot, 8, false, 3, 4, 8, 1, false, xnn_bf16_default_params, struct xnn_bf16_minmax_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfdot, 8, false, 4, 4, 8, 1, false, xnn_bf16_default_params, struct xnn_bf16_minmax_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfdot, 8, false, 5, 4, 8, 1, false, xnn_bf16_default_params, struct xnn_bf16_minmax_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfmlal, 8, 1, 4, 8, 1, false, xnn_bf16_default_params, struct xnn_bf16_minmax_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfmlal, 8, false, 2, 4, 8, 1, xnn_bf16_default_params, struct xnn_bf16_minmax_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfmlal, 8, false, 3, 4, 8, 1, false, xnn_bf16_default_params, struct xnn_bf16_minmax_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) 
-XNN_GEMM(XNN_ARCH_ARM, xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfmlal, 8, false, 4, 4, 8, 1, false, xnn_bf16_default_params, struct xnn_bf16_minmax_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfmlal, 8, false, 5, 4, 8, 1, false, xnn_bf16_default_params, struct xnn_bf16_minmax_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_shland, 8, false, 1, 4, 8, 1, 1, false, xnn_bfloat16, struct xnn_bf16_default_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_shland, 8, false, 2, 4, 8, 1, 2, false, xnn_bfloat16, struct xnn_bf16_default_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_shland, 8, false, 3, 4, 8, 1, 3, false, xnn_bfloat16, struct xnn_bf16_default_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_shland, 8, false, 4, 4, 8, 1, 4, false, xnn_bfloat16, struct xnn_bf16_default_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_shland, 8, false, 5, 4, 8, 1, 5, false, xnn_bfloat16, struct xnn_bf16_default_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_zip, 8, false, 1, 4, 8, 1, 1, false, xnn_bfloat16, struct xnn_bf16_default_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_zip, 8, false, 2, 4, 8, 1, 2, false, xnn_bfloat16, struct xnn_bf16_default_params, xnn_init_bf16_minmax_scalar_params, 
xnn_pack_f16_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_zip, 8, false, 3, 4, 8, 1, 3, false, xnn_bfloat16, struct xnn_bf16_default_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_zip, 8, false, 4, 4, 8, 1, 4, false, xnn_bfloat16, struct xnn_bf16_default_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_zip, 8, false, 5, 4, 8, 1, 5, false, xnn_bfloat16, struct xnn_bf16_default_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ENABLE_ARM_BF16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_bf16, xnn_bf16_gemm_minmax_ukernel_1x8c2__neonbf16_bfdot_lane_ld128, 8, false, 1, 8, 2, 1, 1, false, xnn_bfloat16, struct xnn_bf16_default_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_bf16, xnn_bf16_gemm_minmax_ukernel_4x8c2__neonbf16_bfdot_lane_ld128, 8, false, 4, 8, 2, 1, 4, false, xnn_bfloat16, struct xnn_bf16_default_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_bf16, xnn_bf16_gemm_minmax_ukernel_5x8c2__neonbf16_bfdot_lane_ld128, 8, false, 5, 8, 2, 1, 5, false, xnn_bfloat16, struct xnn_bf16_default_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_bf16, xnn_bf16_gemm_minmax_ukernel_6x8c2__neonbf16_bfdot_lane_ld128, 8, false, 6, 8, 2, 1, 6, false, xnn_bfloat16, struct xnn_bf16_default_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_bf16, xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfdot, 8, false, 1, 4, 8, 1, 1, false, xnn_bfloat16, struct xnn_bf16_default_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_bf16, 
xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfdot, 8, false, 2, 4, 8, 1, 2, false, xnn_bfloat16, struct xnn_bf16_default_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_bf16, xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfdot, 8, false, 3, 4, 8, 1, 3, false, xnn_bfloat16, struct xnn_bf16_default_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_bf16, xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfdot, 8, false, 4, 4, 8, 1, 4, false, xnn_bfloat16, struct xnn_bf16_default_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_bf16, xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfdot, 8, false, 5, 4, 8, 1, 5, false, xnn_bfloat16, struct xnn_bf16_default_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_bf16, xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfmlal, 8, false, 1, 4, 8, 1, 1, false, xnn_bfloat16, struct xnn_bf16_default_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_bf16, xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfmlal, 8, false, 2, 4, 8, 1, 2, false, xnn_bfloat16, struct xnn_bf16_default_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_bf16, xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfmlal, 8, false, 3, 4, 8, 1, 3, false, xnn_bfloat16, struct xnn_bf16_default_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_bf16, xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfmlal, 8, false, 4, 4, 8, 1, 4, false, xnn_bfloat16, struct xnn_bf16_default_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_bf16, xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfmlal, 8, false, 5, 4, 8, 1, 5, false, xnn_bfloat16, struct xnn_bf16_default_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) 
+#endif // XNN_ENABLE_ARM_BF16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) + diff --git a/src/bf16-gemm/bf16-gemm-minmax1.h b/src/bf16-gemm/bf16-gemm-minmax1.h new file mode 100644 index 00000000000..6653817b0a6 --- /dev/null +++ b/src/bf16-gemm/bf16-gemm-minmax1.h @@ -0,0 +1,33 @@ +// Copyright 2023 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. +// Arguments are: +// XNN_GEMM_MINMAX(arch_flags, fn_name, k_block, is_pipelined, mr, nr, kr, sr, mr_packed, is_igemm, datatype, params_type, init_fn, pack_fn) + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_GEMM(XNN_ARCH_ARM, xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_shland, 8, false, 1, 4, 8, 1, false, xnn_bf16_default_params, struct xnn_bf16_minmax_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(XNN_ARCH_ARM, xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_shland, 8, false, 2, 4, 8, 1, false, xnn_bf16_default_params, struct xnn_bf16_minmax_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(XNN_ARCH_ARM, xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_shland, 8, false, 3, 4, 8, 1, false, xnn_bf16_default_params, struct xnn_bf16_minmax_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(XNN_ARCH_ARM, xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_shland, 8, false, 4, 4, 8, 1, false, xnn_bf16_default_params, struct xnn_bf16_minmax_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(XNN_ARCH_ARM, xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_shland, 8, false, 5, 4, 8, 1, false, xnn_bf16_default_params, struct xnn_bf16_minmax_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(XNN_ARCH_ARM, xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_zip, 8, false, 1, 4, 8, 1, false, xnn_bf16_default_params, struct xnn_bf16_minmax_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(XNN_ARCH_ARM, 
xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_zip, 8, false, 2, 4, 8, 1, false, xnn_bf16_default_params, struct xnn_bf16_minmax_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(XNN_ARCH_ARM, xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_zip, 8, false, 3, 4, 8, 1, false, xnn_bf16_default_params, struct xnn_bf16_minmax_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(XNN_ARCH_ARM, xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_zip, 8, false, 4, 4, 8, 1, false, xnn_bf16_default_params, struct xnn_bf16_minmax_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(XNN_ARCH_ARM, xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_zip, 8, false, 5, 4, 8, 1, false, xnn_bf16_default_params, struct xnn_bf16_minmax_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(XNN_ARCH_ARM, xnn_bf16_gemm_minmax_ukernel_1x8c2__neonbf16_bfdot_lane_ld128, 8, false, 1, 8, 2, 1, false, xnn_bf16_default_params, struct xnn_bf16_minmax_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(XNN_ARCH_ARM, xnn_bf16_gemm_minmax_ukernel_4x8c2__neonbf16_bfdot_lane_ld128, 8, false, 4, 8, 2, 1, false, xnn_bf16_default_params, struct xnn_bf16_minmax_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(XNN_ARCH_ARM, xnn_bf16_gemm_minmax_ukernel_5x8c2__neonbf16_bfdot_lane_ld128, 8, false, 5, 8, 2, 1, false, xnn_bf16_default_params, struct xnn_bf16_minmax_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(XNN_ARCH_ARM, xnn_bf16_gemm_minmax_ukernel_6x8c2__neonbf16_bfdot_lane_ld128, 8, false, 6, 8, 2, 1, false, xnn_bf16_default_params, struct xnn_bf16_minmax_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(XNN_ARCH_ARM, xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfdot, 8, false, 1, 4, 8, 1, false, xnn_bf16_default_params, struct xnn_bf16_minmax_params, xnn_init_bf16_minmax_scalar_params, 
xnn_pack_f16_gemm_goi_w) +XNN_GEMM(XNN_ARCH_ARM, xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfdot, 8, false, 2, 4, 8, 1, false, xnn_bf16_default_params, struct xnn_bf16_minmax_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(XNN_ARCH_ARM, xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfdot, 8, false, 3, 4, 8, 1, false, xnn_bf16_default_params, struct xnn_bf16_minmax_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(XNN_ARCH_ARM, xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfdot, 8, false, 4, 4, 8, 1, false, xnn_bf16_default_params, struct xnn_bf16_minmax_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(XNN_ARCH_ARM, xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfdot, 8, false, 5, 4, 8, 1, false, xnn_bf16_default_params, struct xnn_bf16_minmax_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(XNN_ARCH_ARM, xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfmlal, 8, 1, 4, 8, 1, false, xnn_bf16_default_params, struct xnn_bf16_minmax_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(XNN_ARCH_ARM, xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfmlal, 8, false, 2, 4, 8, 1, xnn_bf16_default_params, struct xnn_bf16_minmax_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(XNN_ARCH_ARM, xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfmlal, 8, false, 3, 4, 8, 1, false, xnn_bf16_default_params, struct xnn_bf16_minmax_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(XNN_ARCH_ARM, xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfmlal, 8, false, 4, 4, 8, 1, false, xnn_bf16_default_params, struct xnn_bf16_minmax_params, xnn_init_bf16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(XNN_ARCH_ARM, xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfmlal, 8, false, 5, 4, 8, 1, false, xnn_bf16_default_params, struct xnn_bf16_minmax_params, xnn_init_bf16_minmax_scalar_params, 
xnn_pack_f16_gemm_goi_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 diff --git a/src/f16-f32acc-gemm/f16-f32acc-gemm-minmax.h b/src/f16-f32acc-gemm/f16-f32acc-gemm-minmax.h index 6d41163251a..bdb7a25f002 100644 --- a/src/f16-f32acc-gemm/f16-f32acc-gemm-minmax.h +++ b/src/f16-f32acc-gemm/f16-f32acc-gemm-minmax.h @@ -1,18 +1,20 @@ // Copyright 2023 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// Arguments are: -// XNN_GEMM_MINMAX(arch_flags, fn_name, k_block, is_pipelined, mr, nr, kr, sr, mr_packed, is_igemm, datatype, params_type, init_fn, pack_fn) - + // + // This source code is licensed under the BSD-style license found in the + // LICENSE file in the root directory of this source tree. + // Arguments are: + // XNN_GEMM(arch_flags, fn_name, k_block, is_pipelined, mr, nr, kr, sr, mr_packed, is_igemm, datatype, params_type, init_fn, pack_fn) + + #if XNN_ARCH_X86 || XNN_ARCH_X86_64 -XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_f32acc_gemm_minmax_ukernel_1x8__avx2_broadcast, 1, false, 1, 8, 1, 1, false, xnn_f16_default_params, struct xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_f32acc_gemm_minmax_ukernel_1x16__avx2_broadcast, 1, false, 1, 16, 1, 1, false, xnn_f16_default_params, struct xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_f32acc_gemm_minmax_ukernel_3x16__avx2_broadcast, 1, false, 3, 16, 1, 1, false, xnn_f16_default_params, struct xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_f32acc_gemm_minmax_ukernel_4x8__avx2_broadcast, 1, false, 4, 8, 1, 1, false, xnn_f16_default_params, struct xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_f32acc_gemm_minmax_ukernel_4x16__avx2_broadcast, 
1, false, 4, 16, 1, 1, false, xnn_f16_default_params, struct xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_f32acc_gemm_minmax_ukernel_5x8__avx2_broadcast, 1, false, 5, 8, 1, 1, false, xnn_f16_default_params, struct xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_f32acc_gemm_minmax_ukernel_5x16__avx2_broadcast, 1, false, 5, 16, 1, 1, false, xnn_f16_default_params, struct xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_f32acc_gemm_minmax_ukernel_6x8__avx2_broadcast, 1, false, 6, 8, 1, 1, false, xnn_f16_default_params, struct xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_f32acc_gemm_minmax_ukernel_7x8__avx2_broadcast, 1, false, 7, 8, 1, 1, false, xnn_f16_default_params, struct xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_f32acc_gemm_minmax_ukernel_1x8__avx2_broadcast, 1, false, 1, 8, 1, 1, 1, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_f32acc_gemm_minmax_ukernel_1x16__avx2_broadcast, 1, false, 1, 16, 1, 1, 1, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_f32acc_gemm_minmax_ukernel_3x16__avx2_broadcast, 1, false, 3, 16, 1, 1, 3, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_f32acc_gemm_minmax_ukernel_4x8__avx2_broadcast, 1, false, 4, 8, 1, 1, 4, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, 
xnn_f16_f32acc_gemm_minmax_ukernel_4x16__avx2_broadcast, 1, false, 4, 16, 1, 1, 4, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_f32acc_gemm_minmax_ukernel_5x8__avx2_broadcast, 1, false, 5, 8, 1, 1, 5, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_f32acc_gemm_minmax_ukernel_5x16__avx2_broadcast, 1, false, 5, 16, 1, 1, 5, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_f32acc_gemm_minmax_ukernel_6x8__avx2_broadcast, 1, false, 6, 8, 1, 1, 6, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_f32acc_gemm_minmax_ukernel_7x8__avx2_broadcast, 1, false, 7, 8, 1, 1, 7, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_gemm_goi_w) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + diff --git a/src/f16-f32acc-igemm/f16-f32acc-igemm-minmax.h b/src/f16-f32acc-igemm/f16-f32acc-igemm-minmax.h index e69de29bb2d..f0bae08e725 100644 --- a/src/f16-f32acc-igemm/f16-f32acc-igemm-minmax.h +++ b/src/f16-f32acc-igemm/f16-f32acc-igemm-minmax.h @@ -0,0 +1,20 @@ +// Copyright 2023 Google LLC + // + // This source code is licensed under the BSD-style license found in the + // LICENSE file in the root directory of this source tree. 
+ // Arguments are: + // XNN_GEMM(arch_flags, fn_name, k_block, is_pipelined, mr, nr, kr, sr, mr_packed, is_igemm, datatype, params_type, init_fn, pack_fn) + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_f32acc_igemm_minmax_ukernel_1x8__avx2_broadcast, 1, false, 1, 8, 1, 1, 1, true, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_f32acc_igemm_minmax_ukernel_1x16__avx2_broadcast, 1, false, 1, 16, 1, 1, 1, true, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_f32acc_igemm_minmax_ukernel_3x16__avx2_broadcast, 1, false, 3, 16, 1, 1, 3, true, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_f32acc_igemm_minmax_ukernel_4x8__avx2_broadcast, 1, false, 4, 8, 1, 1, 4, true, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_f32acc_igemm_minmax_ukernel_4x16__avx2_broadcast, 1, false, 4, 16, 1, 1, 4, true, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_f32acc_igemm_minmax_ukernel_5x8__avx2_broadcast, 1, false, 5, 8, 1, 1, 5, true, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_f32acc_igemm_minmax_ukernel_5x16__avx2_broadcast, 1, false, 5, 16, 1, 1, 5, true, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_f32acc_igemm_minmax_ukernel_6x8__avx2_broadcast, 1, false, 6, 8, 1, 1, 6, true, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_conv_goki_w) 
+XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_f32acc_igemm_minmax_ukernel_7x8__avx2_broadcast, 1, false, 7, 8, 1, 1, 7, true, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_conv_goki_w) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + diff --git a/src/f16-gemm/f16-gemm-minmax.h b/src/f16-gemm/f16-gemm-minmax.h index 4e78bf1e8aa..0208c216e55 100644 --- a/src/f16-gemm/f16-gemm-minmax.h +++ b/src/f16-gemm/f16-gemm-minmax.h @@ -1,60 +1,62 @@ // Copyright 2023 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// Arguments are: -// XNN_GEMM_MINMAX(arch_flags, fn_name, k_block, is_pipelined, mr, nr, kr, sr, mr_packed, is_igemm, datatype, params_type, init_fn, pack_fn) - -#if XNN_ENABLE_ARM_FP16_VECTOR && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY -XNN_GEMM(XNN_ARCH_ARM64, xnn_f16_gemm_minmax_ukernel_1x8__asm_aarch64_neonfp16arith_ld64, 4, false, 1, 8, 1, 1, false, xnn_f16_default_params, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x8__neon_ld4lane_u8) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f16_gemm_minmax_ukernel_4x8__asm_aarch64_neonfp16arith_ld64, 4, false, 4, 8, 1, 1, false, xnn_f16_default_params, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x8__neon_ld4lane_u8) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f16_gemm_minmax_ukernel_6x8__asm_aarch64_neonfp16arith_ld64, 4, false, 6, 8, 1, 1, false, xnn_f16_default_params, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x8__neon_ld4lane_u8) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f16_gemm_minmax_ukernel_8x8__asm_aarch64_neonfp16arith_ld64, 4, false, 8, 8, 1, 1, false, xnn_f16_default_params, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x8__neon_ld4lane_u8) -XNN_GEMM(XNN_ARCH_ARM64, 
xnn_f16_gemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld32, 2, false, 1, 16, 1, 1, false, xnn_f16_default_params, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x16__neon_ld4lane_u8) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f16_gemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64, 4, false, 1, 16, 1, 1, false, xnn_f16_default_params, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x16__neon_ld4lane_u8) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f16_gemm_minmax_ukernel_4x16__asm_aarch64_neonfp16arith_ld32, 2, false, 4, 16, 1, 1, false, xnn_f16_default_params, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x16__neon_ld4lane_u8) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f16_gemm_minmax_ukernel_4x16__asm_aarch64_neonfp16arith_ld64, 4, false, 4, 16, 1, 1, false, xnn_f16_default_params, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x16__neon_ld4lane_u8) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a55, 4, false, 6, 16, 1, 1, false, xnn_f16_default_params, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x16__neon_ld4lane_u8) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a55r0, 4, false, 6, 16, 1, 1, false, xnn_f16_default_params, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x16__neon_ld4lane_u8) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a75, 4, false, 6, 16, 1, 1, false, xnn_f16_default_params, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x16__neon_ld4lane_u8) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_ld32, 2, false, 6, 16, 1, 1, false, xnn_f16_default_params, union 
xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x16__neon_ld4lane_u8) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_ld64, 4, false, 6, 16, 1, 1, false, xnn_f16_default_params, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x16__neon_ld4lane_u8) -#endif // XNN_ENABLE_ARM_FP16_VECTOR && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY + // + // This source code is licensed under the BSD-style license found in the + // LICENSE file in the root directory of this source tree. + // Arguments are: + // XNN_GEMM(arch_flags, fn_name, k_block, is_pipelined, mr, nr, kr, sr, mr_packed, is_igemm, datatype, params_type, init_fn, pack_fn) + + +#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_f16_gemm_minmax_ukernel_1x8__asm_aarch64_neonfp16arith_ld64, 4, false, 1, 8, 1, 1, 1, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x8__neon_ld4lane_u8) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_f16_gemm_minmax_ukernel_4x8__asm_aarch64_neonfp16arith_ld64, 4, false, 4, 8, 1, 1, 4, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x8__neon_ld4lane_u8) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_f16_gemm_minmax_ukernel_6x8__asm_aarch64_neonfp16arith_ld64, 4, false, 6, 8, 1, 1, 6, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x8__neon_ld4lane_u8) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_f16_gemm_minmax_ukernel_8x8__asm_aarch64_neonfp16arith_ld64, 4, false, 8, 8, 1, 1, 8, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x8__neon_ld4lane_u8) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_f16_gemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld32, 2, false, 1, 
16, 1, 1, 1, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x16__neon_ld4lane_u8) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_f16_gemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64, 4, false, 1, 16, 1, 1, 1, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x16__neon_ld4lane_u8) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_f16_gemm_minmax_ukernel_4x16__asm_aarch64_neonfp16arith_ld32, 2, false, 4, 16, 1, 1, 4, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x16__neon_ld4lane_u8) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_f16_gemm_minmax_ukernel_4x16__asm_aarch64_neonfp16arith_ld64, 4, false, 4, 16, 1, 1, 4, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x16__neon_ld4lane_u8) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a55, 4, false, 6, 16, 1, 1, 6, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x16__neon_ld4lane_u8) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a55r0, 4, false, 6, 16, 1, 1, 6, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x16__neon_ld4lane_u8) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a75, 4, false, 6, 16, 1, 1, 6, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x16__neon_ld4lane_u8) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_ld32, 2, false, 6, 16, 1, 1, 6, false, xnn_float16, struct xnn_f16_default_params, 
xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x16__neon_ld4lane_u8) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_ld64, 4, false, 6, 16, 1, 1, 6, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x16__neon_ld4lane_u8) +#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM64) #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) -XNN_GEMM(XNN_ARCH_ARM, xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64, 4, false, 1, 8, 1, 1, false, xnn_f16_default_params, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x8__neon_ld4lane_u8) -XNN_GEMM(XNN_ARCH_ARM, xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64, 4, false, 4, 8, 1, 1, false, xnn_f16_default_params, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x8__neon_ld4lane_u8) -XNN_GEMM(XNN_ARCH_ARM, xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64, 4, false, 6, 8, 1, 1, false, xnn_f16_default_params, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x8__neon_ld4lane_u8) -XNN_GEMM(XNN_ARCH_ARM, xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64, 4, false, 8, 8, 1, 1, false, xnn_f16_default_params, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x8__neon_ld4lane_u8) -XNN_GEMM(XNN_ARCH_ARM, xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64, 4, false, 1, 16, 1, 1, false, xnn_f16_default_params, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x16__neon_ld4lane_u8) -XNN_GEMM(XNN_ARCH_ARM, xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64, 4, false, 4, 16, 1, 1, false, xnn_f16_default_params, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x16__neon_ld4lane_u8) -XNN_GEMM(XNN_ARCH_ARM, 
xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64, 4, false, 6, 16, 1, 1, false, xnn_f16_default_params, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x16__neon_ld4lane_u8) -XNN_GEMM(XNN_ARCH_ARM, xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64, 4, false, 8, 16, 1, 1, false, xnn_f16_default_params, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x16__neon_ld4lane_u8) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64, 4, false, 1, 8, 1, 1, 1, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x8__neon_ld4lane_u8) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64, 4, false, 4, 8, 1, 1, 4, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x8__neon_ld4lane_u8) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64, 4, false, 6, 8, 1, 1, 6, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x8__neon_ld4lane_u8) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64, 4, false, 8, 8, 1, 1, 8, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x8__neon_ld4lane_u8) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64, 4, false, 1, 16, 1, 1, 1, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x16__neon_ld4lane_u8) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64, 4, false, 4, 16, 1, 1, 4, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, 
xnn_x16_packw_gemm_goi_ukernel_x16__neon_ld4lane_u8) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64, 4, false, 6, 16, 1, 1, 6, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x16__neon_ld4lane_u8) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64, 4, false, 8, 16, 1, 1, 8, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x16__neon_ld4lane_u8) #endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) #if XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) -XNN_GEMM(xnn_arch_x86_avx512fp16, xnn_f16_gemm_minmax_ukernel_1x32__avx512fp16_broadcast, 1, false, 1, 32, 1, 1, false, xnn_f16_default_params, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x32__scalar_int_u4) -XNN_GEMM(xnn_arch_x86_avx512fp16, xnn_f16_gemm_minmax_ukernel_4x32__avx512fp16_broadcast, 1, false, 4, 32, 1, 1, false, xnn_f16_default_params, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x32__scalar_int_u4) -XNN_GEMM(xnn_arch_x86_avx512fp16, xnn_f16_gemm_minmax_ukernel_5x32__avx512fp16_broadcast, 1, false, 5, 32, 1, 1, false, xnn_f16_default_params, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x32__scalar_int_u4) -XNN_GEMM(xnn_arch_x86_avx512fp16, xnn_f16_gemm_minmax_ukernel_6x32__avx512fp16_broadcast, 1, false, 6, 32, 1, 1, false, xnn_f16_default_params, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x32__scalar_int_u4) -XNN_GEMM(xnn_arch_x86_avx512fp16, xnn_f16_gemm_minmax_ukernel_7x32__avx512fp16_broadcast, 1, false, 7, 32, 1, 1, false, xnn_f16_default_params, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, 
xnn_x16_packw_gemm_goi_ukernel_x32__scalar_int_u4) -XNN_GEMM(xnn_arch_x86_avx512fp16, xnn_f16_gemm_minmax_ukernel_8x32__avx512fp16_broadcast, 1, false, 8, 32, 1, 1, false, xnn_f16_default_params, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x32__scalar_int_u4) -XNN_GEMM(xnn_arch_x86_avx512fp16, xnn_f16_gemm_minmax_ukernel_1x64__avx512fp16_broadcast, 1, false, 1, 64, 1, 1, false, xnn_f16_default_params, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x64__scalar_int_u4) -XNN_GEMM(xnn_arch_x86_avx512fp16, xnn_f16_gemm_minmax_ukernel_4x64__avx512fp16_broadcast, 1, false, 4, 64, 1, 1, false, xnn_f16_default_params, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x64__scalar_int_u4) -XNN_GEMM(xnn_arch_x86_avx512fp16, xnn_f16_gemm_minmax_ukernel_5x64__avx512fp16_broadcast, 1, false, 5, 64, 1, 1, false, xnn_f16_default_params, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x64__scalar_int_u4) -XNN_GEMM(xnn_arch_x86_avx512fp16, xnn_f16_gemm_minmax_ukernel_6x64__avx512fp16_broadcast, 1, false, 6, 64, 1, 1, false, xnn_f16_default_params, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x64__scalar_int_u4) -XNN_GEMM(xnn_arch_x86_avx512fp16, xnn_f16_gemm_minmax_ukernel_7x64__avx512fp16_broadcast, 1, false, 7, 64, 1, 1, false, xnn_f16_default_params, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x64__scalar_int_u4) -XNN_GEMM(xnn_arch_x86_avx512fp16, xnn_f16_gemm_minmax_ukernel_8x64__avx512fp16_broadcast, 1, false, 8, 64, 1, 1, false, xnn_f16_default_params, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x64__scalar_int_u4) +XNN_GEMM(xnn_arch_x86_avx512fp16, xnn_f16_gemm_minmax_ukernel_1x32__avx512fp16_broadcast, 1, false, 1, 32, 1, 1, 1, false, 
xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x32__scalar_int_u4) +XNN_GEMM(xnn_arch_x86_avx512fp16, xnn_f16_gemm_minmax_ukernel_4x32__avx512fp16_broadcast, 1, false, 4, 32, 1, 1, 4, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x32__scalar_int_u4) +XNN_GEMM(xnn_arch_x86_avx512fp16, xnn_f16_gemm_minmax_ukernel_5x32__avx512fp16_broadcast, 1, false, 5, 32, 1, 1, 5, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x32__scalar_int_u4) +XNN_GEMM(xnn_arch_x86_avx512fp16, xnn_f16_gemm_minmax_ukernel_6x32__avx512fp16_broadcast, 1, false, 6, 32, 1, 1, 6, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x32__scalar_int_u4) +XNN_GEMM(xnn_arch_x86_avx512fp16, xnn_f16_gemm_minmax_ukernel_7x32__avx512fp16_broadcast, 1, false, 7, 32, 1, 1, 7, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x32__scalar_int_u4) +XNN_GEMM(xnn_arch_x86_avx512fp16, xnn_f16_gemm_minmax_ukernel_8x32__avx512fp16_broadcast, 1, false, 8, 32, 1, 1, 8, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x32__scalar_int_u4) +XNN_GEMM(xnn_arch_x86_avx512fp16, xnn_f16_gemm_minmax_ukernel_1x64__avx512fp16_broadcast, 1, false, 1, 64, 1, 1, 1, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x64__scalar_int_u4) +XNN_GEMM(xnn_arch_x86_avx512fp16, xnn_f16_gemm_minmax_ukernel_4x64__avx512fp16_broadcast, 1, false, 4, 64, 1, 1, 4, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x64__scalar_int_u4) +XNN_GEMM(xnn_arch_x86_avx512fp16, xnn_f16_gemm_minmax_ukernel_5x64__avx512fp16_broadcast, 
1, false, 5, 64, 1, 1, 5, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x64__scalar_int_u4) +XNN_GEMM(xnn_arch_x86_avx512fp16, xnn_f16_gemm_minmax_ukernel_6x64__avx512fp16_broadcast, 1, false, 6, 64, 1, 1, 6, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x64__scalar_int_u4) +XNN_GEMM(xnn_arch_x86_avx512fp16, xnn_f16_gemm_minmax_ukernel_7x64__avx512fp16_broadcast, 1, false, 7, 64, 1, 1, 7, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x64__scalar_int_u4) +XNN_GEMM(xnn_arch_x86_avx512fp16, xnn_f16_gemm_minmax_ukernel_8x64__avx512fp16_broadcast, 1, false, 8, 64, 1, 1, 8, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x64__scalar_int_u4) #endif // XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_X86 || XNN_ARCH_X86_64 -XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_gemm_minmax_ukernel_1x8__avx2_broadcast, 1, false, 1, 8, 1, 1, false, xnn_f16_default_params, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x8__avx2_u16) -XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_gemm_minmax_ukernel_4x8__avx2_broadcast, 1, false, 4, 8, 1, 1, false, xnn_f16_default_params, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x8__avx2_u16) -XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_gemm_minmax_ukernel_5x8__avx2_broadcast, 1, false, 5, 8, 1, 1, false, xnn_f16_default_params, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x8__avx2_u16) -XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_gemm_minmax_ukernel_6x8__avx2_broadcast, 1, false, 6, 8, 1, 1, false, xnn_f16_default_params, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x8__avx2_u16) 
-XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_gemm_minmax_ukernel_7x8__avx2_broadcast, 1, false, 7, 8, 1, 1, false, xnn_f16_default_params, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x8__avx2_u16) -XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast, 1, false, 1, 16, 1, 1, false, xnn_f16_default_params, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x16__avx2_u16_prfm) -XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_gemm_minmax_ukernel_3x16__avx2_broadcast, 1, false, 3, 16, 1, 1, false, xnn_f16_default_params, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x16__avx2_u16_prfm) -XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_gemm_minmax_ukernel_4x16__avx2_broadcast, 1, false, 4, 16, 1, 1, false, xnn_f16_default_params, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x16__avx2_u16_prfm) -XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_gemm_minmax_ukernel_5x16__avx2_broadcast, 1, false, 5, 16, 1, 1, false, xnn_f16_default_params, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x16__avx2_u16_prfm) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_gemm_minmax_ukernel_1x8__avx2_broadcast, 1, false, 1, 8, 1, 1, 1, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x8__avx2_u16) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_gemm_minmax_ukernel_4x8__avx2_broadcast, 1, false, 4, 8, 1, 1, 4, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x8__avx2_u16) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_gemm_minmax_ukernel_5x8__avx2_broadcast, 1, false, 5, 8, 1, 1, 5, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x8__avx2_u16) +XNN_GEMM(xnn_arch_x86_avx2, 
xnn_f16_gemm_minmax_ukernel_6x8__avx2_broadcast, 1, false, 6, 8, 1, 1, 6, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x8__avx2_u16) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_gemm_minmax_ukernel_7x8__avx2_broadcast, 1, false, 7, 8, 1, 1, 7, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x8__avx2_u16) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast, 1, false, 1, 16, 1, 1, 1, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x16__avx2_u16_prfm) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_gemm_minmax_ukernel_3x16__avx2_broadcast, 1, false, 3, 16, 1, 1, 3, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x16__avx2_u16_prfm) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_gemm_minmax_ukernel_4x16__avx2_broadcast, 1, false, 4, 16, 1, 1, 4, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x16__avx2_u16_prfm) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_gemm_minmax_ukernel_5x16__avx2_broadcast, 1, false, 5, 16, 1, 1, 5, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_x16_packw_gemm_goi_ukernel_x16__avx2_u16_prfm) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + diff --git a/src/f16-igemm/f16-igemm-minmax.h b/src/f16-igemm/f16-igemm-minmax.h index e69de29bb2d..4d7dd7a16f8 100644 --- a/src/f16-igemm/f16-igemm-minmax.h +++ b/src/f16-igemm/f16-igemm-minmax.h @@ -0,0 +1,58 @@ +// Copyright 2023 Google LLC + // + // This source code is licensed under the BSD-style license found in the + // LICENSE file in the root directory of this source tree. 
+ // Arguments are: + // XNN_GEMM(arch_flags, fn_name, k_block, is_pipelined, mr, nr, kr, sr, mr_packed, is_igemm, datatype, params_type, init_fn, pack_fn) + + +#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_f16_igemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld32, 2, false, 1, 16, 1, 1, 1, true, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_f16_igemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64, 4, false, 1, 16, 1, 1, 1, true, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_f16_igemm_minmax_ukernel_4x16__asm_aarch64_neonfp16arith_ld32, 2, false, 4, 16, 1, 1, 4, true, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_f16_igemm_minmax_ukernel_4x16__asm_aarch64_neonfp16arith_ld64, 4, false, 4, 16, 1, 1, 4, true, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_f16_igemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a55, 2, false, 6, 16, 1, 1, 6, true, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_f16_igemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a55r0, 4, false, 6, 16, 1, 1, 6, true, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_f16_igemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a75, 4, false, 6, 16, 1, 1, 6, true, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, 
xnn_f16_igemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_ld32, 2, false, 6, 16, 1, 1, 6, true, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_f16_igemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_ld64, 4, false, 6, 16, 1, 1, 6, true, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_conv_goki_w) +#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM64) + +#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64, 4, false, 1, 8, 1, 1, 1, true, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64, 4, false, 1, 16, 1, 1, 1, true, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64, 4, false, 4, 8, 1, 1, 4, true, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64, 4, false, 4, 16, 1, 1, 4, true, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64, 4, false, 6, 8, 1, 1, 6, true, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64, 4, false, 6, 16, 1, 1, 6, true, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_conv_goki_w) 
+XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64, 4, false, 8, 8, 1, 1, 8, true, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64, 4, false, 8, 16, 1, 1, 8, true, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_conv_goki_w) +#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_igemm_minmax_ukernel_1x8__avx2_broadcast, 1, false, 1, 8, 1, 1, 1, true, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast, 1, false, 1, 16, 1, 1, 1, true, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_igemm_minmax_ukernel_3x16__avx2_broadcast, 1, false, 3, 16, 1, 1, 3, true, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_igemm_minmax_ukernel_4x8__avx2_broadcast, 1, false, 4, 8, 1, 1, 4, true, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_igemm_minmax_ukernel_4x16__avx2_broadcast, 1, false, 4, 16, 1, 1, 4, true, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_igemm_minmax_ukernel_5x8__avx2_broadcast, 1, false, 5, 8, 1, 1, 5, true, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_igemm_minmax_ukernel_5x16__avx2_broadcast, 1, false, 5, 16, 1, 1, 5, true, xnn_float16, 
struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_igemm_minmax_ukernel_6x8__avx2_broadcast, 1, false, 6, 8, 1, 1, 6, true, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f16_igemm_minmax_ukernel_7x8__avx2_broadcast, 1, false, 7, 8, 1, 1, 7, true, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_conv_goki_w) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +XNN_GEMM(xnn_arch_x86_avx512fp16, xnn_f16_igemm_minmax_ukernel_1x32__avx512fp16_broadcast, 1, false, 1, 32, 1, 1, 1, true, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512fp16, xnn_f16_igemm_minmax_ukernel_4x32__avx512fp16_broadcast, 1, false, 4, 32, 1, 1, 4, true, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512fp16, xnn_f16_igemm_minmax_ukernel_5x32__avx512fp16_broadcast, 1, false, 5, 32, 1, 1, 5, true, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512fp16, xnn_f16_igemm_minmax_ukernel_6x32__avx512fp16_broadcast, 1, false, 6, 32, 1, 1, 6, true, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512fp16, xnn_f16_igemm_minmax_ukernel_7x32__avx512fp16_broadcast, 1, false, 7, 32, 1, 1, 7, true, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512fp16, xnn_f16_igemm_minmax_ukernel_8x32__avx512fp16_broadcast, 1, false, 8, 32, 1, 1, 8, true, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, 
xnn_pack_f16_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512fp16, xnn_f16_igemm_minmax_ukernel_1x64__avx512fp16_broadcast, 1, false, 1, 64, 1, 1, 1, true, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512fp16, xnn_f16_igemm_minmax_ukernel_4x64__avx512fp16_broadcast, 1, false, 4, 64, 1, 1, 4, true, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512fp16, xnn_f16_igemm_minmax_ukernel_5x64__avx512fp16_broadcast, 1, false, 5, 64, 1, 1, 5, true, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512fp16, xnn_f16_igemm_minmax_ukernel_6x64__avx512fp16_broadcast, 1, false, 6, 64, 1, 1, 6, true, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512fp16, xnn_f16_igemm_minmax_ukernel_7x64__avx512fp16_broadcast, 1, false, 7, 64, 1, 1, 7, true, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512fp16, xnn_f16_igemm_minmax_ukernel_8x64__avx512fp16_broadcast, 1, false, 8, 64, 1, 1, 8, true, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_f16_conv_goki_w) +#endif // XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + diff --git a/src/f32-gemm/f32-gemm-goi-minmax.h b/src/f32-gemm/f32-gemm-goi-minmax.h index e69de29bb2d..193e94a17c5 100644 --- a/src/f32-gemm/f32-gemm-goi-minmax.h +++ b/src/f32-gemm/f32-gemm-goi-minmax.h @@ -0,0 +1,14 @@ +// Copyright 2023 Google LLC + // + // This source code is licensed under the BSD-style license found in the + // LICENSE file in the root directory of this source tree. 
+ // Arguments are: + // XNN_GEMM(arch_flags, fn_name, k_block, is_pipelined, mr, nr, kr, sr, mr_packed, is_igemm, datatype, params_type, init_fn, pack_fn) + + +#if XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_goi_minmax_ukernel_1x8__asm_aarch64_neonfma_ld128, 4, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, None) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_goi_minmax_ukernel_1x8__asm_aarch64_neonfma_ld128_prfm, 4, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, None) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_goi_minmax_ukernel_4x8__asm_aarch64_neonfma_ld128, 4, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, None) +#endif // XNN_ARCH_ARM64 + diff --git a/src/f32-gemm/f32-gemm-minmax.h b/src/f32-gemm/f32-gemm-minmax.h index 5e93df991dd..3e0c00ee694 100644 --- a/src/f32-gemm/f32-gemm-minmax.h +++ b/src/f32-gemm/f32-gemm-minmax.h @@ -1,309 +1,319 @@ // Copyright 2023 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// Arguments are: -// XNN_GEMM_MINMAX(arch_flags, fn_name, k_block, is_pipelined, mr, nr, kr, sr, mr_packed, is_igemm, datatype, params_type, init_fn, pack_fn) + // + // This source code is licensed under the BSD-style license found in the + // LICENSE file in the root directory of this source tree. 
+ // Arguments are: + // XNN_GEMM(arch_flags, fn_name, k_block, is_pipelined, mr, nr, kr, sr, mr_packed, is_igemm, datatype, params_type, init_fn, pack_fn) + + +#if XNN_ARCH_ARM +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch32_neon_cortex_a53, 2, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch32_neon_cortex_a53_prfm, 2, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_4x4__asm_aarch32_vfp_ld64, 2, false, 4, 4, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a7, 2, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a53, 4, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a53_prfm, 4, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a55, 4, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a75, 4, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, 
xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a75_prfm, 4, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch32_neon_ld64, 2, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +#endif // XNN_ARCH_ARM -#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY -XNN_GEMM(XNN_ARCH_ARM, xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch32_neon_cortex_a53_prfm, 2 false, 1, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a7, 2, false, 4, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a53_prfm, 4, true, 4, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a55, 4, true, 4, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a75, 4, true, 4, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a75_prfm, 4, true, 4, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch32_neon_cortex_a53, 2, false, 1, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) 
-XNN_GEMM(XNN_ARCH_ARM, xnn_f32_gemm_minmax_ukernel_4x4__asm_aarch32_vfp_ld64, 2, false, 4, 4, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a53, 4, true, 4, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch32_neon_ld64, 2, false, 4, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -#endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY +#if XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neon_ld128_acc2, 4, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neon_ld128_acc2_prfm, 4, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a53, 8, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a53_prfm, 8, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a75, 8, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a75_prfm, 8, false, 1, 8, 1, 1, 1, false, float, struct 
xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld64, 2, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld64_acc2, 2, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld64_acc2_prfm, 2, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld64_acc4, 4, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld64_acc4_prfm, 4, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld64_prfm, 2, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld128, 4, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld128_acc2, 4, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, 
xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld128_acc2_prfm, 4, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld128_acc4, 4, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld128_acc4_prfm, 4, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld128_prfm, 4, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_1x12__asm_aarch64_neonfma_cortex_a53, 4, false, 1, 12, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_4x1__asm_aarch64_neonfma_ld64, 2, false, 4, 1, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_4x1__asm_aarch64_neonfma_ld128, 4, false, 4, 1, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_4x2__asm_aarch64_neonfma_cortex_a75, 8, false, 4, 2, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_4x2__asm_aarch64_neonfma_cortex_a75_prfm, 8, false, 4, 2, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, 
xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_4x2__asm_aarch64_neonfma_ld64, 2, false, 4, 2, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_4x2__asm_aarch64_neonfma_ld128, 4, false, 4, 2, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a53, 4, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a53_prfm, 4, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a55, 4, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a75, 8, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a75_prfm, 8, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch64_neonfma_ld64, 2, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch64_neonfma_ld128, 4, false, 4, 8, 1, 1, 4, false, float, struct 
xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_4x12__asm_aarch64_neonfma_cortex_a53, 4, false, 4, 12, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_5x8__asm_aarch64_neonfma_cortex_a75, 8, false, 5, 8, 1, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_5x8__asm_aarch64_neonfma_cortex_a75_prfm, 8, false, 5, 8, 1, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a53, 4, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a53_prfm, 4, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a55, 4, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a73, 8, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a75, 8, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, 
xnn_f32_gemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a75_prfm, 8, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_6x8__asm_aarch64_neonfma_ld64, 2, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_6x8__asm_aarch64_neonfma_ld128, 4, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +#endif // XNN_ARCH_ARM64 -#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neon_ld128_acc2, 4, false, 1, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a53, 8, true, 1, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld64, 2, false, 1, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld64_acc2_prfm, 2, false, 1, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld64_acc4, 4, false, 1, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld64_prfm, 2, false, 1, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, 
xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld128, 4, false, 1, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld128_acc2, 4, false, 1, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld128_acc4_prfm, 4, false, 1, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_1x12__asm_aarch64_neonfma_cortex_a53, 4, true, 1, 12, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_4x1__asm_aarch64_neonfma_ld64, 2, false, 4, 1, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_4x2__asm_aarch64_neonfma_cortex_a75, 8, false, 4, 2, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_4x2__asm_aarch64_neonfma_cortex_a75_prfm, 8, false, 4, 2, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_4x2__asm_aarch64_neonfma_ld128, 4, false, 4, 2, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a53, 4, true, 4, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) 
-XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch64_neonfma_ld128, 4, false, 4, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_4x12__asm_aarch64_neonfma_cortex_a53, 4, true, 4, 12, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_5x8__asm_aarch64_neonfma_cortex_a75_prfm, 8, true, 5, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a53_prfm, 4, ture, 6, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a55, 4, true, 6, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a75, 8, true, 6, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a75_prfm, 8, true, 6, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_6x8__asm_aarch64_neonfma_ld128, 4, false, 6, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neon_ld128_acc2_prfm, 4, false, 1, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, 
xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a53_prfm, 8, true, 1, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a75, 8, true, 16, 1, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a75_prfm, 8, true, 1, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld64_acc2, 2, false, 1, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld64_acc4_prfm, 4, false, 1, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld128_acc2_prfm, 4, false, 1, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld128_acc4, 4, false, 1, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld128_prfm, 4, false, 1, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_4x1__asm_aarch64_neonfma_ld128, 4, false, 4, 1, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, 
xnn_f32_gemm_minmax_ukernel_4x2__asm_aarch64_neonfma_ld64, 2, false, 4, 2, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a53_prfm, 4, true, 4, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a55, 4, true, 4, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a75, 8, true, 4, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a75_prfm, 8, true, 4, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch64_neonfma_ld64, 2, false, 4, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_5x8__asm_aarch64_neonfma_cortex_a75, 8, true, 16, 5, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a53, 4, true, 6, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a73, 8, true, 6, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, 
xnn_f32_gemm_minmax_ukernel_6x8__asm_aarch64_neonfma_ld64, 2, false, 6, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_gemm_minmax_ukernel_1x16__neon_lane_ld128, 4, false, 1, 16, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_gemm_minmax_ukernel_2x16__neon_lane_ld128, 4, false, 2, 16, 1, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_gemm_minmax_ukernel_3x16__neon_lane_ld128, 4, false, 3, 16, 1, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_gemm_minmax_ukernel_4x16__neon_lane_ld128, 4, false, 4, 16, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_gemm_minmax_ukernel_5x16__neon_lane_ld128, 4, false, 5, 16, 1, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_gemm_minmax_ukernel_6x16__neon_lane_ld128, 4, false, 6, 16, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_1x16__aarch64_neonfma_lane_ld128, 4, false, 1, 16, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_2x16__aarch64_neonfma_lane_ld128, 4, false, 2, 16, 1, 1, 2, false, float, struct xnn_f32_default_params, 
xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_3x16__aarch64_neonfma_lane_ld128, 4, false, 3, 16, 1, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_4x16__aarch64_neonfma_lane_ld128, 4, false, 4, 16, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_5x16__aarch64_neonfma_lane_ld128, 4, false, 5, 16, 1, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_6x16__aarch64_neonfma_lane_ld128, 4, false, 6, 16, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +#endif // XNN_ARCH_ARM64 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 -XNN_GEMM(XNN_ARCH_ARM, xnn_f32_gemm_minmax_ukernel_1x16__neon_lane_ld128, 4, false, 1, 16, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_f32_gemm_minmax_ukernel_6x16__neon_lane_ld128, 4, false, 6, 16, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128, 4, false, 4, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128, 4, false, 6, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_f32_gemm_minmax_ukernel_1x8__neon_dup_ld64, 2, false, 1, 8, 1, 1, false, float, struct xnn_f32_minmax_params , 
xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64, 2, false, 1, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_f32_gemm_minmax_ukernel_1x8s4__neon, 4, false, 1, 8, 1, 4, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma, 4, false, 1, 8, 1, 4, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld128, 4, false, 4, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld64, 2, false, 4, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld64, 2, false, 4, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_f32_gemm_minmax_ukernel_4x8s4__neon, 4, false, 4, 8, 1, 4, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_f32_gemm_minmax_ukernel_5x8__neon_lane_ld64, 2, false, 5, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_f32_gemm_minmax_ukernel_6x2__neon_lane_ld64, 2, false, 6, 2, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_f32_gemm_minmax_ukernel_2x16__neon_lane_ld128, 4, false, 2, 16, 1, 1, false, float, struct xnn_f32_minmax_params , 
xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_f32_gemm_minmax_ukernel_3x16__neon_lane_ld128, 4, false, 3, 16, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_f32_gemm_minmax_ukernel_4x16__neon_lane_ld128, 4, false, 4, 16, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_f32_gemm_minmax_ukernel_5x16__neon_lane_ld128, 4, false, 5, 16, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld128, 4, false, 1, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_f32_gemm_minmax_ukernel_1x8__neonfma_dup_ld64, 2, false, 1, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_f32_gemm_minmax_ukernel_4x2__neon_lane_ld64, 2, false, 4, 2, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld64, 2, false, 4, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld128, 4, false, 4, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_f32_gemm_minmax_ukernel_4x8s4__neonfma, 4, false, 4, 8, 1, 4, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_f32_gemm_minmax_ukernel_6x8__neon_dup_ld128, 4, false, 6, 8, 1, 1, false, float, struct 
xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld64, 2, false, 6, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_f32_gemm_minmax_ukernel_6x8__neonfma_dup_ld64, 2, false, 6, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_f32_gemm_minmax_ukernel_6x8__neonfma_dup_ld128, 4, false, 6, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_f32_gemm_minmax_ukernel_6x8s4__neon, 4, false, 6, 8, 1, 4, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM, xnn_f32_gemm_minmax_ukernel_8x8s4__neon, 4, false, 8, 8, 1, 4, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld128, 4, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128, 4, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128, 4, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 #if XNN_ARCH_ARM64 -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_2x16__aarch64_neonfma_lane_ld128, 4, false, 2, 16, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) 
-XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_3x16__aarch64_neonfma_lane_ld128, 4, false, 3, 16, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_6x16__aarch64_neonfma_lane_ld128, 4, false, 6, 16, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_lane_ld128, 4, false, 4, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_lane_ld128, 4, false, 6, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_lane_ld64, 2, false, 1, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_lane_ld64, 2, false, 4, 2, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_lane_ld64, 2, false, 4, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_lane_ld64, 2, false, 5, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_6x2__aarch64_neonfma_lane_ld64, 2, false, 6, 2, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, 
xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_lane_ld64, 2, false, 6, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_6x8__neon_dup_ld64, 2, false, 6, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma, 4, false, 6, 8, 1, 4, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma, 4, false, 8, 8, 1, 4, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_1x16__aarch64_neonfma_lane_ld128, 4, false, 1, 16, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_4x16__aarch64_neonfma_lane_ld128, 4, false, 4, 16, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_5x16__aarch64_neonfma_lane_ld128, 4, false, 5, 16, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_ARM64, xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_lane_ld128, 4, false, 1, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_lane_ld128, 4, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_lane_ld128, 4, false, 4, 8, 1, 1, 4, false, 
float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_lane_ld128, 4, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_lane_ld64, 2, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) #endif // XNN_ARCH_ARM64 +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_gemm_minmax_ukernel_1x8__neon_dup_ld64, 2, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64, 2, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_1x8__neonfma_dup_ld64, 2, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_gemm_minmax_ukernel_1x8s4__neon, 4, false, 1, 8, 1, 4, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma, 4, false, 1, 8, 1, 4, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_lane_ld64, 2, false, 4, 2, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +#endif // XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 
+XNN_GEMM(xnn_arch_arm_neon, xnn_f32_gemm_minmax_ukernel_4x2__neon_lane_ld64, 2, false, 4, 2, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_lane_ld64, 2, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +#endif // XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld64, 2, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld128, 4, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld64, 2, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld64, 2, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld128, 4, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_gemm_minmax_ukernel_4x8s4__neon, 4, false, 4, 8, 1, 4, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_4x8s4__neonfma, 4, false, 4, 8, 1, 4, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, 
xnn_pack_f32_gemm_goi_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_lane_ld64, 2, false, 5, 8, 1, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +#endif // XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_gemm_minmax_ukernel_5x8__neon_lane_ld64, 2, false, 5, 8, 1, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_6x2__aarch64_neonfma_lane_ld64, 2, false, 6, 2, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +#endif // XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_gemm_minmax_ukernel_6x2__neon_lane_ld64, 2, false, 6, 2, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_lane_ld64, 2, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +#endif // XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_gemm_minmax_ukernel_6x8__neon_dup_ld64, 2, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_gemm_minmax_ukernel_6x8__neon_dup_ld128, 4, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld64, 2, false, 6, 
8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_6x8__neonfma_dup_ld64, 2, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_6x8__neonfma_dup_ld128, 4, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_gemm_minmax_ukernel_6x8s4__neon, 4, false, 6, 8, 1, 4, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma, 4, false, 6, 8, 1, 4, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_gemm_minmax_ukernel_8x8s4__neon, 4, false, 8, 8, 1, 4, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma, 4, false, 8, 8, 1, 4, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + #if XNN_ARCH_X86 || XNN_ARCH_X86_64 -XNN_GEMM(XNN_ARCH_X86, xnn_f32_gemm_minmax_ukernel_3x8__sse_dup, 4, false, 3, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_X86, xnn_f32_gemm_minmax_ukernel_3x8__sse_load1, 1, false, 3, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_X86, xnn_f32_gemm_minmax_ukernel_3x8s4__sse, 4, false, 3, 8, 1, 4, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, 
xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_X86, xnn_f32_gemm_minmax_ukernel_4x8__sse_load1, 1, false, 4, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_X86, xnn_f32_gemm_minmax_ukernel_5x8__sse_dup, 4, false, 5, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_X86, xnn_f32_gemm_minmax_ukernel_5x8__sse_load1, 1, false, 5, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_X86, xnn_f32_gemm_minmax_ukernel_6x2c4__sse, 4, false, 6, 2, 4, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_X86, xnn_f32_gemm_minmax_ukernel_6x8__sse_dup, 4, false, 6, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_X86, xnn_f32_gemm_minmax_ukernel_6x8s4__sse, 4, false, 6, 8, 1, 4, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_X86, xnn_f32_gemm_minmax_ukernel_1x8__sse_dup, 4, false, 1, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_X86, xnn_f32_gemm_minmax_ukernel_1x8__sse_load1, 1, false, 1, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_X86, xnn_f32_gemm_minmax_ukernel_1x8s4__sse, 4, false, 1, 8, 1, 4, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_X86, xnn_f32_gemm_minmax_ukernel_4x2c4__sse, 4, false, 4, 2, 4, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_X86, 
xnn_f32_gemm_minmax_ukernel_4x8__sse_dup, 4, false, 4, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_X86, xnn_f32_gemm_minmax_ukernel_4x8s4__sse, 4, false, 4, 8, 1, 4, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_X86, xnn_f32_gemm_minmax_ukernel_5x8s4__sse, 4, false, 5, 8, 1, 4, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_X86, xnn_f32_gemm_minmax_ukernel_6x8__sse_load1, 1, false, 6, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx, xnn_f32_gemm_minmax_ukernel_1x8__avx_broadcast, 1, false, 1, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx, xnn_f32_gemm_minmax_ukernel_3x16__avx_broadcast, 1, false, 3, 16, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx, xnn_f32_gemm_minmax_ukernel_5x8__avx_broadcast, 1, false, 5, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx, xnn_f32_gemm_minmax_ukernel_6x8__avx_broadcast, 1, false, 6, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx, xnn_f32_gemm_minmax_ukernel_6x16__avx_broadcast, 1, false, 6, 16, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx, xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast, 1, false, 1, 16, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx, 
xnn_f32_gemm_minmax_ukernel_4x8__avx_broadcast, 1, false, 4, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx, xnn_f32_gemm_minmax_ukernel_4x16__avx_broadcast, 1, false, 4, 16, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx, xnn_f32_gemm_minmax_ukernel_5x16__avx_broadcast, 1, false, 5, 16, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx, xnn_f32_gemm_minmax_ukernel_7x8__avx_broadcast, 1, false, 7, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast, 1, false, 1, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast, 1, false, 1, 16, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_gemm_minmax_ukernel_3x16__fma3_broadcast, 1, false, 3, 16, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_gemm_minmax_ukernel_4x8__fma3_broadcast, 1, false, 4, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_gemm_minmax_ukernel_5x8__fma3_broadcast, 1, false, 5, 8, , 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast, 4, false, 1, 16, 1, 4, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, 
xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast, 4, false, 3, 16, 1, 4, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_gemm_minmax_ukernel_4x16__fma3_broadcast, 1, false, 4, 16, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast, 4, false, 4, 16, 1, 4, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_gemm_minmax_ukernel_5x16__fma3_broadcast, 1, false, 5, 16, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast, 4, false, 5, 16, 1, 4, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_gemm_minmax_ukernel_6x8__fma3_broadcast, 1, false, 6, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_gemm_minmax_ukernel_6x16__fma3_broadcast, 1, false, 6, 16, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_gemm_minmax_ukernel_6x16s4__fma3_broadcast, 4, false, 6, 16, 1, 4, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_gemm_minmax_ukernel_7x8__fma3_broadcast, 1, false, 7, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_gemm_minmax_ukernel_8x8__fma3_broadcast, 1, false, 8, 8, 1, 1, false, 
float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_1x8__sse_dup, 4, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_1x8__sse_load1, 1, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_1x8s4__sse, 4, false, 1, 8, 1, 4, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_3x8__sse_dup, 4, false, 3, 8, 1, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_3x8__sse_load1, 1, false, 3, 8, 1, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_3x8s4__sse, 4, false, 3, 8, 1, 4, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_4x2c4__sse, 4, false, 4, 2, 4, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_4x8__sse_dup, 4, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_4x8__sse_load1, 1, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_4x8s4__sse, 4, false, 4, 8, 1, 4, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, 
xnn_f32_gemm_minmax_ukernel_5x8__sse_dup, 4, false, 5, 8, 1, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_5x8__sse_load1, 1, false, 5, 8, 1, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_5x8s4__sse, 4, false, 5, 8, 1, 4, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_6x2c4__sse, 4, false, 6, 2, 4, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_6x8__sse_dup, 4, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_6x8__sse_load1, 1, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_6x8s4__sse, 4, false, 6, 8, 1, 4, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_f32_gemm_minmax_ukernel_1x8__avx_broadcast, 1, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast, 1, false, 1, 16, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_f32_gemm_minmax_ukernel_3x16__avx_broadcast, 1, false, 3, 16, 1, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_f32_gemm_minmax_ukernel_4x8__avx_broadcast, 1, false, 4, 8, 1, 1, 
4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_f32_gemm_minmax_ukernel_4x16__avx_broadcast, 1, false, 4, 16, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_f32_gemm_minmax_ukernel_5x8__avx_broadcast, 1, false, 5, 8, 1, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_f32_gemm_minmax_ukernel_5x16__avx_broadcast, 1, false, 5, 16, 1, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_f32_gemm_minmax_ukernel_6x8__avx_broadcast, 1, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_f32_gemm_minmax_ukernel_6x16__avx_broadcast, 1, false, 6, 16, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_f32_gemm_minmax_ukernel_7x8__avx_broadcast, 1, false, 7, 8, 1, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast, 1, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast, 1, false, 1, 16, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast, 4, false, 1, 16, 1, 4, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) 
+XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_gemm_minmax_ukernel_3x16__fma3_broadcast, 1, false, 3, 16, 1, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast, 4, false, 3, 16, 1, 4, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_gemm_minmax_ukernel_4x8__fma3_broadcast, 1, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_gemm_minmax_ukernel_4x16__fma3_broadcast, 1, false, 4, 16, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast, 4, false, 4, 16, 1, 4, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_gemm_minmax_ukernel_5x8__fma3_broadcast, 1, false, 5, 8, 1, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_gemm_minmax_ukernel_5x16__fma3_broadcast, 1, false, 5, 16, 1, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast, 4, false, 5, 16, 1, 4, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_gemm_minmax_ukernel_6x8__fma3_broadcast, 1, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_gemm_minmax_ukernel_6x16__fma3_broadcast, 1, false, 6, 16, 1, 1, 6, 
false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_gemm_minmax_ukernel_6x16s4__fma3_broadcast, 4, false, 6, 16, 1, 4, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_gemm_minmax_ukernel_7x8__fma3_broadcast, 1, false, 7, 8, 1, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_gemm_minmax_ukernel_8x8__fma3_broadcast, 1, false, 8, 8, 1, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 #if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) -XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast, 1, false, 1, 16, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_6x16__avx512f_broadcast, 1, false, 6, 16, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_7x16__avx512f_broadcast, 1, false, 7, 16, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_8x16__avx512f_broadcast, 1, false, 8, 16, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_9x16__avx512f_broadcast, 1, false, 9, 16, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_10x16__avx512f_broadcast, 1, false, 10, 
16, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_11x16__avx512f_broadcast, 1, false, 11, 16, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_16x16__avx512f_broadcast, 1, false, 16, 16, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_1x32__avx512f_broadcast, 1, false, 1, 32, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_6x32__avx512f_broadcast, 1, false, 6, 32, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_7x32__avx512f_broadcast, 1, false, 7, 32, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_8x32__avx512f_broadcast, 1, false, 8, 32, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_9x32__avx512f_broadcast, 1, false, 9, 32, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_10x32__avx512f_broadcast, 1, false, 10, 32, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_11x32__avx512f_broadcast, 1, false, 11, 32, 1, 1, false, float, struct xnn_f32_minmax_params , 
xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_16x32__avx512f_broadcast, 1, false, 16, 32, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_1x64__avx512f_broadcast, 1, false, 1, 64, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_6x64__avx512f_broadcast, 1, false, 6, 64, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_7x64__avx512f_broadcast, 1, false, 7, 64, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_8x64__avx512f_broadcast, 1, false, 8, 64, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_9x64__avx512f_broadcast, 1, false, 9, 64, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_10x64__avx512f_broadcast, 1, false, 10, 64, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_11x64__avx512f_broadcast, 1, false, 11, 64, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_16x64__avx512f_broadcast, 1, false, 16, 64, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) 
-XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_4x16__avx512f_broadcast, 1, false, 4, 16, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_5x16__avx512f_broadcast, 1, false, 5, 16, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_12x16__avx512f_broadcast, 1, false, 12, 16, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_13x16__avx512f_broadcast, 1, false, 13, 16, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_14x16__avx512f_broadcast, 1, false, 14, 16, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_15x16__avx512f_broadcast, 1, false, 15, 16, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_4x32__avx512f_broadcast, 1, false, 4, 32, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_5x32__avx512f_broadcast, 1, false, 5, 32, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_12x32__avx512f_broadcast, 1, false, 12, 32, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx512f, 
xnn_f32_gemm_minmax_ukernel_13x32__avx512f_broadcast, 1, false, 13, 32, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_14x32__avx512f_broadcast, 1, false, 14, 32, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_15x32__avx512f_broadcast, 1, false, 15, 32, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_4x64__avx512f_broadcast, 1, false, 4, 64, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_5x64__avx512f_broadcast, 1, false, 5, 64, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_12x64__avx512f_broadcast, 1, false, 12, 64, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_13x64__avx512f_broadcast, 1, false, 13, 64, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_14x64__avx512f_broadcast, 1, false, 14, 64, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_15x64__avx512f_broadcast, 1, false, 15, 64, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast, 1, false, 1, 
16, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_4x16__avx512f_broadcast, 1, false, 4, 16, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_5x16__avx512f_broadcast, 1, false, 5, 16, 1, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_6x16__avx512f_broadcast, 1, false, 6, 16, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_7x16__avx512f_broadcast, 1, false, 7, 16, 1, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_8x16__avx512f_broadcast, 1, false, 8, 16, 1, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_1x32__avx512f_broadcast, 1, false, 1, 32, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_4x32__avx512f_broadcast, 1, false, 4, 32, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_5x32__avx512f_broadcast, 1, false, 5, 32, 1, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_6x32__avx512f_broadcast, 1, false, 6, 32, 1, 1, 6, false, float, struct 
xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_7x32__avx512f_broadcast, 1, false, 7, 32, 1, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_gemm_minmax_ukernel_8x32__avx512f_broadcast, 1, false, 8, 32, 1, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) #endif // XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -XNN_GEMM(XNN_ARCH_WASMSIMD, xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat, 4, false, 1, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMSIMD, xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat, 1, false, 1, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMSIMD, xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat, 4, false, 1, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMSIMD, xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat, 1, false, 3, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMSIMD, xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm, 4, false, 3, 8, 1, 4, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMSIMD, xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86, 4, false, 3, 8, 1, 4, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMSIMD, xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm, 4, false, 4, 2, 4, 1, false, float, 
struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMSIMD, xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86, 4, false, 4, 2, 4, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMSIMD, xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat, 1, false, 4, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMSIMD, xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_loadsplat, 1, false, 5, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMSIMD, xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_splat, 4, false, 6, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMSIMD, xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_loadsplat, 1, false, 6, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMSIMD, xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat, 4, false, 6, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMSIMD, xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm, 4, false, 6, 8, 1, 4, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMSIMD, xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86, 4, false, 6, 8, 1, 4, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMSIMD, xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_loadsplat, 1, false, 1, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) 
-XNN_GEMM(XNN_ARCH_WASMSIMD, xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_arm, 4, false, 1, 8, 1, 4, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMSIMD, xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_x86, 4, false, 1, 8, 1, 4, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMSIMD, xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_arm_loadsplat, 1, false, 3, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMSIMD, xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_arm_splat, 4, false, 3, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMSIMD, xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_splat, 4, false, 3, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMSIMD, xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_arm_loadsplat, 1, false, 4, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMSIMD, xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_arm_splat, 4, false, 4, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMSIMD, xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_splat, 4, false, 4, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMSIMD, xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_arm, 4, false, 4, 8, 1, 4, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMSIMD, xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_x86, 4, false, 4, 8, 1, 4, false, float, struct 
xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMSIMD, xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_splat, 4, false, 5, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMSIMD, xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_x86_loadsplat, 1, false, 5, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMSIMD, xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_x86_splat, 4, false, 5, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMSIMD, xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_arm, 4, false, 5, 8, 1, 4, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMSIMD, xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_x86, 4, false, 5, 8, 1, 4, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMSIMD, xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_loadsplat, 1, false, 6, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_loadsplat, 1, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat, 4, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat, 1, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat, 
4, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_arm, 4, false, 1, 8, 1, 4, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_x86, 4, false, 1, 8, 1, 4, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_arm_loadsplat, 1, false, 3, 8, 1, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_arm_splat, 4, false, 3, 8, 1, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat, 1, false, 3, 8, 1, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_splat, 4, false, 3, 8, 1, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm, 4, false, 3, 8, 1, 4, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86, 4, false, 3, 8, 1, 4, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm, 4, false, 4, 2, 4, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86, 4, false, 4, 2, 4, 1, 4, false, float, struct 
xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_arm_loadsplat, 1, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_arm_splat, 4, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat, 1, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_splat, 4, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_arm, 4, false, 4, 8, 1, 4, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_x86, 4, false, 4, 8, 1, 4, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_loadsplat, 1, false, 5, 8, 1, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_splat, 4, false, 5, 8, 1, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_x86_loadsplat, 1, false, 5, 8, 1, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_x86_splat, 4, false, 5, 8, 1, 1, 5, false, float, struct xnn_f32_default_params, 
xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_arm, 4, false, 5, 8, 1, 4, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_x86, 4, false, 5, 8, 1, 4, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_loadsplat, 1, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_splat, 4, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_loadsplat, 1, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat, 4, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm, 4, false, 6, 8, 1, 4, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86, 4, false, 6, 8, 1, 4, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD #if XNN_ARCH_WASMRELAXEDSIMD -XNN_GEMM(XNN_ARCH_WASMRELAXEDSIMD, xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_loadsplat, 1, false, 1, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMRELAXEDSIMD, 
xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_loadsplat, 1, false, 3, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMRELAXEDSIMD, xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_splat, 4, false, 3, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMRELAXEDSIMD, xnn_f32_gemm_minmax_ukernel_3x8s4__wasmrelaxedsimd_fma, 4, false, 3, 8, 1, 4, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMRELAXEDSIMD, xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_loadsplat, 1, false, 4, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMRELAXEDSIMD, xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_splat, 4, false, 4, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMRELAXEDSIMD, xnn_f32_gemm_minmax_ukernel_4x8s4__wasmrelaxedsimd_fma, 4, false, 4, 8, 1, 4, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMRELAXEDSIMD, xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_loadsplat, 1, false, 5, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMRELAXEDSIMD, xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_loadsplat, 1, false, 6, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMRELAXEDSIMD, xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat, 1, false, 1, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMRELAXEDSIMD, 
xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_splat, 4, false, 1, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMRELAXEDSIMD, xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_splat, 4, false, 1, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMRELAXEDSIMD, xnn_f32_gemm_minmax_ukernel_1x8s4__wasmrelaxedsimd, 4, false, 1, 8, 1, 4, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMRELAXEDSIMD, xnn_f32_gemm_minmax_ukernel_1x8s4__wasmrelaxedsimd_fma, 4, false, 1, 8, 1, 4, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMRELAXEDSIMD, xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_fma_loadsplat, 1, false, 3, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMRELAXEDSIMD, xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_fma_splat, 4, false, 3, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMRELAXEDSIMD, xnn_f32_gemm_minmax_ukernel_3x8s4__wasmrelaxedsimd, 4, false, 3, 8, 1, 4, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMRELAXEDSIMD, xnn_f32_gemm_minmax_ukernel_4x2c4__wasmrelaxedsimd, 4, false, 4, 2, 4, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMRELAXEDSIMD, xnn_f32_gemm_minmax_ukernel_4x2c4__wasmrelaxedsimd_fma, 4, false, 4, 2, 4, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMRELAXEDSIMD, 
xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_fma_loadsplat, 1, false, 4, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMRELAXEDSIMD, xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_fma_splat, 4, false, 4, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMRELAXEDSIMD, xnn_f32_gemm_minmax_ukernel_4x8s4__wasmrelaxedsimd, 4, false, 4, 8, 1, 4, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMRELAXEDSIMD, xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_fma_loadsplat, 1, 5, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMRELAXEDSIMD, xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_fma_splat, 4, false, 5, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMRELAXEDSIMD, xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_splat, 4, false, 5, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMRELAXEDSIMD, xnn_f32_gemm_minmax_ukernel_5x8s4__wasmrelaxedsimd, 4, false, 5, 8, 1, 4, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMRELAXEDSIMD, xnn_f32_gemm_minmax_ukernel_5x8s4__wasmrelaxedsimd_fma, 4, false, 5, 8, 1, 4, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMRELAXEDSIMD, xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_fma_loadsplat, 1, false, 6, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMRELAXEDSIMD, 
xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_fma_splat, 4, false, 6, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMRELAXEDSIMD, xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_splat, 4, false, 6, 8, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMRELAXEDSIMD, xnn_f32_gemm_minmax_ukernel_6x8s4__wasmrelaxedsimd, 4, false, 6, 8, 1, 4, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMRELAXEDSIMD, xnn_f32_gemm_minmax_ukernel_6x8s4__wasmrelaxedsimd_fma, 4, false, 6, 8, 1, 4, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat, 1, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_splat, 4, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_loadsplat, 1, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_splat, 4, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_1x8s4__wasmrelaxedsimd, 4, false, 1, 8, 1, 4, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_1x8s4__wasmrelaxedsimd_fma, 4, false, 1, 8, 1, 4, 1, false, float, struct xnn_f32_default_params, 
xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_fma_loadsplat, 1, false, 3, 8, 1, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_fma_splat, 4, false, 3, 8, 1, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_loadsplat, 1, false, 3, 8, 1, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_splat, 4, false, 3, 8, 1, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_3x8s4__wasmrelaxedsimd, 4, false, 3, 8, 1, 4, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_3x8s4__wasmrelaxedsimd_fma, 4, false, 3, 8, 1, 4, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_4x2c4__wasmrelaxedsimd, 4, false, 4, 2, 4, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_4x2c4__wasmrelaxedsimd_fma, 4, false, 4, 2, 4, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_fma_loadsplat, 1, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_fma_splat, 4, false, 4, 8, 1, 1, 4, false, float, struct 
xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_loadsplat, 1, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_splat, 4, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_4x8s4__wasmrelaxedsimd, 4, false, 4, 8, 1, 4, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_4x8s4__wasmrelaxedsimd_fma, 4, false, 4, 8, 1, 4, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_fma_loadsplat, 1, false, 5, 8, 1, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_fma_splat, 4, false, 5, 8, 1, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_loadsplat, 1, false, 5, 8, 1, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_splat, 4, false, 5, 8, 1, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_5x8s4__wasmrelaxedsimd, 4, false, 5, 8, 1, 4, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_5x8s4__wasmrelaxedsimd_fma, 4, false, 5, 8, 1, 4, 5, false, float, 
struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_fma_loadsplat, 1, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_fma_splat, 4, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_loadsplat, 1, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_splat, 4, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_6x8s4__wasmrelaxedsimd, 4, false, 6, 8, 1, 4, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_6x8s4__wasmrelaxedsimd_fma, 4, false, 6, 8, 1, 4, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) #endif // XNN_ARCH_WASMRELAXEDSIMD #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -XNN_GEMM(XNN_ARCH_WASM, xnn_f32_gemm_minmax_ukernel_2x4__wasm, 1, false, 2, 4, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASM, xnn_f32_gemm_minmax_ukernel_4x2__wasm, 1, false, 4, 2, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASM, xnn_f32_gemm_minmax_ukernel_4x4__wasm, 1, false, 4, 4, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASM, 
xnn_f32_gemm_minmax_ukernel_1x4__wasm, 1, false, 1, 4, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_1x4__wasm, 1, false, 1, 4, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_2x4__wasm, 1, false, 2, 4, 1, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_4x2__wasm, 1, false, 4, 2, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_4x4__wasm, 1, false, 4, 4, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) #endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -// SCALAR -XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_1x4__scalar, 1, false, 1, 4, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_4x2__scalar, 1, false, 4, 2, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_4x4__scalar, 1, false, 4, 4, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_2x4__scalar, 1, false, 2, 4, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_1x4__scalar, 1, false, 1, 4, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_2x4__scalar, 1, false, 2, 4, 1, 1, 2, false, float, struct 
xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_4x2__scalar, 1, false, 4, 2, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_minmax_ukernel_4x4__scalar, 1, false, 4, 4, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) + +#if XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) +XNN_GEMM(xnn_arch_riscv_vector, xnn_f32_gemm_minmax_ukernel_1x4v__rvv, 1, false, 1, 4, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_riscv_vector, xnn_f32_gemm_minmax_ukernel_7x4v__rvv, 1, false, 7, 4, 1, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +#endif // XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) -#if XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV -XNN_GEMM(XNN_ARCH_RISCV, xnn_f32_gemm_minmax_ukernel_1x4v__rvv, 1, false, 1, 4, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_RISCV, xnn_f32_gemm_minmax_ukernel_7x4v__rvv, 1, false, 7, 4, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -#endif // XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV +#if XNN_ENABLE_HVX && (XNN_ARCH_HEXAGON) +XNN_GEMM(xnn_arch_hvx, xnn_f32_gemm_minmax_ukernel_1x32__hvx_broadcast, 1, false, 1, 32, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_hvx, xnn_f32_gemm_minmax_ukernel_1x64__hvx_broadcast, 1, false, 1, 64, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_hvx, xnn_f32_gemm_minmax_ukernel_1x128__hvx_broadcast, 1, false, 1, 128, 1, 1, 1, 
false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_hvx, xnn_f32_gemm_minmax_ukernel_2x128__hvx_broadcast, 1, false, 2, 128, 1, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_hvx, xnn_f32_gemm_minmax_ukernel_4x64__hvx_broadcast, 1, false, 4, 64, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_hvx, xnn_f32_gemm_minmax_ukernel_7x64__hvx_broadcast, 1, false, 7, 64, 1, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_hvx, xnn_f32_gemm_minmax_ukernel_8x32__hvx_broadcast, 1, false, 8, 32, 1, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_hvx, xnn_f32_gemm_minmax_ukernel_16x32__hvx_broadcast, 1, false, 16, 32, 1, 1, 16, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +#endif // XNN_ENABLE_HVX && (XNN_ARCH_HEXAGON) -#if XNN_ENABLE_HVX && XNN_ARCH_HEXAGON -XNN_GEMM(XNN_ARCH_HEXAGON, xnn_f32_gemm_minmax_ukernel_1x128__hvx_broadcast, 1, false, 1, 128, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_HEXAGON, xnn_f32_gemm_minmax_ukernel_2x128__hvx_broadcast, 1, false, 2, 128, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_HEXAGON, xnn_f32_gemm_minmax_ukernel_7x64__hvx_broadcast, 1, false, 7, 64, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_HEXAGON, xnn_f32_gemm_minmax_ukernel_8x32__hvx_broadcast, 1, false, 8, 32, 1, 1, false, float, struct xnn_f32_minmax_params , 
xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_HEXAGON, xnn_f32_gemm_minmax_ukernel_16x32__hvx_broadcast, 1, false, 16, 32, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_HEXAGON, xnn_f32_gemm_minmax_ukernel_1x32__hvx_broadcast, 1, false, 1, 32, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_HEXAGON, xnn_f32_gemm_minmax_ukernel_1x64__hvx_broadcast, 1, false, 1, 64, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_HEXAGON, xnn_f32_gemm_minmax_ukernel_4x64__hvx_broadcast, 1, 4, 64, 1, 1, false, float, struct xnn_f32_minmax_params , xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) -#endif // XNN_ENABLE_HVX && XNN_ARCH_HEXAGON diff --git a/src/f32-gemm/f32-gemm-relu.h b/src/f32-gemm/f32-gemm-relu.h index b735aed7699..cad2659063b 100644 --- a/src/f32-gemm/f32-gemm-relu.h +++ b/src/f32-gemm/f32-gemm-relu.h @@ -1,62 +1,63 @@ // Copyright 2023 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// Arguments are: -// XNN_GEMM_MINMAX(arch_flags, fn_name, k_block, is_pipelined, mr, nr, kr, sr, mr_packed, is_igemm, datatype, params_type, init_fn, pack_fn) - + // + // This source code is licensed under the BSD-style license found in the + // LICENSE file in the root directory of this source tree. 
+ // Arguments are: + // XNN_GEMM(arch_flags, fn_name, k_block, is_pipelined, mr, nr, kr, sr, mr_packed, is_igemm, datatype, params_type, init_fn, pack_fn) + + #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -XNN_GEMM(XNN_ARCH_WASMSIMD, xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_loadsplat, 1, false, 1, 8, 1, 1, false, float, struct xnn_f32_minmax_params , NULL, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMSIMD, xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat, 4, false, 1, 8, 1, 1, false, float, struct xnn_f32_minmax_params , NULL, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMSIMD, xnn_f32_gemm_relu_ukernel_1x8s4__wasmsimd, 4, false, 1, 8, 1, 4, false, float, struct xnn_f32_minmax_params , NULL, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMSIMD, xnn_f32_gemm_relu_ukernel_4x8__wasmsimd_splat, 4, false, 4, 8, 1, 1, false, float, struct xnn_f32_minmax_params , NULL, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMSIMD, xnn_f32_gemm_relu_ukernel_5x8__wasmsimd_splat, 4, false, 5, 8, 1, 1, false, float, struct xnn_f32_minmax_params , NULL, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMSIMD, xnn_f32_gemm_relu_ukernel_6x8__wasmsimd_loadsplat, 1, false, 6, 8, 1, 1, false, float, struct xnn_f32_minmax_params , NULL, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMSIMD, xnn_f32_gemm_relu_ukernel_6x8s4__wasmsimd, 4, false, 6, 8, 1, 4, false, float, struct xnn_f32_minmax_params , NULL, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMSIMD, xnn_f32_gemm_relu_ukernel_3x8__wasmsimd_loadsplat, 1, 3, 8, 1, 1, false, float, struct xnn_f32_minmax_params , NULL, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMSIMD, xnn_f32_gemm_relu_ukernel_3x8__wasmsimd_splat, 4, false, 3, 8, 1, 1, false, float, struct xnn_f32_minmax_params , NULL, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMSIMD, xnn_f32_gemm_relu_ukernel_3x8s4__wasmsimd, 4, false, 3, 8, 1, 4, false, float, struct xnn_f32_minmax_params , NULL, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMSIMD, 
xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd, 4, false, 4, 2, 4, 1, false, float, struct xnn_f32_minmax_params , NULL, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMSIMD, xnn_f32_gemm_relu_ukernel_4x8__wasmsimd_loadsplat, 1, false, 4, 8, 1, 1, false, float, struct xnn_f32_minmax_params , NULL, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMSIMD, xnn_f32_gemm_relu_ukernel_4x8s4__wasmsimd, 4, false, 4, 8, 1, 4, false, float, struct xnn_f32_minmax_params , NULL, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMSIMD, xnn_f32_gemm_relu_ukernel_5x8__wasmsimd_loadsplat, 1, false, 5, 8, 1, 1, false, float, struct xnn_f32_minmax_params , NULL, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMSIMD, xnn_f32_gemm_relu_ukernel_5x8s4__wasmsimd, 4, false, 5, 8, 1, 4, false, float, struct xnn_f32_minmax_params , NULL, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMSIMD, xnn_f32_gemm_relu_ukernel_6x8__wasmsimd_splat, 4, false, 6, 8, 1, 1, false, float, struct xnn_f32_minmax_params , NULL, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_loadsplat, 1, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat, 4, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_relu_ukernel_1x8s4__wasmsimd, 4, false, 1, 8, 1, 4, 1, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_relu_ukernel_3x8__wasmsimd_loadsplat, 1, false, 3, 8, 1, 1, 3, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_relu_ukernel_3x8__wasmsimd_splat, 4, false, 3, 8, 1, 1, 3, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) 
NULL), xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_relu_ukernel_3x8s4__wasmsimd, 4, false, 3, 8, 1, 4, 3, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd, 4, false, 4, 2, 4, 1, 4, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_relu_ukernel_4x8__wasmsimd_loadsplat, 1, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_relu_ukernel_4x8__wasmsimd_splat, 4, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_relu_ukernel_4x8s4__wasmsimd, 4, false, 4, 8, 1, 4, 4, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_relu_ukernel_5x8__wasmsimd_loadsplat, 1, false, 5, 8, 1, 1, 5, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_relu_ukernel_5x8__wasmsimd_splat, 4, false, 5, 8, 1, 1, 5, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_relu_ukernel_5x8s4__wasmsimd, 4, false, 5, 8, 1, 4, 5, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_relu_ukernel_6x8__wasmsimd_loadsplat, 1, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_relu_ukernel_6x8__wasmsimd_splat, 4, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, ((struct 
xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_relu_ukernel_6x8s4__wasmsimd, 4, false, 6, 8, 1, 4, 6, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_gemm_goi_w) #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD #if XNN_ARCH_WASMRELAXEDSIMD -XNN_GEMM(XNN_ARCH_WASMRELAXEDSIMD, xnn_f32_gemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_splat, 4, false, 1, 8, 1, 1, false, float, struct xnn_f32_minmax_params , NULL, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMRELAXEDSIMD, xnn_f32_gemm_relu_ukernel_3x8__wasmrelaxedsimd_fma_splat, 4, false, 3, 8, 1, 1, false, float, struct xnn_f32_minmax_params , NULL, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMRELAXEDSIMD, xnn_f32_gemm_relu_ukernel_3x8s4__wasmrelaxedsimd_fma, 4, false, 3, 8, 1, 4, false, float, struct xnn_f32_minmax_params , NULL, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMRELAXEDSIMD, xnn_f32_gemm_relu_ukernel_4x8__wasmrelaxedsimd_fma_splat, 4, false, 4, 8, 1, 1, false, float, struct xnn_f32_minmax_params , NULL, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMRELAXEDSIMD, xnn_f32_gemm_relu_ukernel_4x8s4__wasmrelaxedsimd_fma, 4, false, 4, 8, 1, 4, false, float, struct xnn_f32_minmax_params , NULL, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMRELAXEDSIMD, xnn_f32_gemm_relu_ukernel_5x8__wasmrelaxedsimd_fma_splat, 4, false, 5, 8, 1, 1, false, float, struct xnn_f32_minmax_params , NULL, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMRELAXEDSIMD, xnn_f32_gemm_relu_ukernel_6x8__wasmrelaxedsimd_fma_splat, 4, false, 6, 8, 1, 1, false, float, struct xnn_f32_minmax_params , NULL, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMRELAXEDSIMD, xnn_f32_gemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat, 1, false, 1, 8, 1, 1, false, float, struct xnn_f32_minmax_params , NULL, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMRELAXEDSIMD, xnn_f32_gemm_relu_ukernel_1x8s4__wasmrelaxedsimd_fma, 4, false, 1, 8, 1, 4, false, 
float, struct xnn_f32_minmax_params , NULL, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMRELAXEDSIMD, xnn_f32_gemm_relu_ukernel_3x8__wasmrelaxedsimd_fma_loadsplat, 1, false, 3, 8, 1, 1, false, float, struct xnn_f32_minmax_params , NULL, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMRELAXEDSIMD, xnn_f32_gemm_relu_ukernel_4x2c4__wasmrelaxedsimd_fma, 4, false, 4, 2, 4, 1, false, float, struct xnn_f32_minmax_params , NULL, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMRELAXEDSIMD, xnn_f32_gemm_relu_ukernel_4x8__wasmrelaxedsimd_fma_loadsplat, 1, false, 4, 8, 1, 1, false, float, struct xnn_f32_minmax_params , NULL, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMRELAXEDSIMD, xnn_f32_gemm_relu_ukernel_5x8__wasmrelaxedsimd_fma_loadsplat, 1, false, 5, 8, 1, 1, false, float, struct xnn_f32_minmax_params , NULL, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMRELAXEDSIMD, xnn_f32_gemm_relu_ukernel_5x8s4__wasmrelaxedsimd_fma, 4, false, 5, 8, 1, 4, false, float, struct xnn_f32_minmax_params , NULL, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMRELAXEDSIMD, xnn_f32_gemm_relu_ukernel_6x8__wasmrelaxedsimd_fma_loadsplat, 1, false, 6, 8, 1, 1, false, float, struct xnn_f32_minmax_params , NULL, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASMRELAXEDSIMD, xnn_f32_gemm_relu_ukernel_6x8s4__wasmrelaxedsimd_fma, 4, false, 6, 8, 1, 4, false, float, struct xnn_f32_minmax_params , NULL, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat, 1, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_splat, 4, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_relu_ukernel_1x8s4__wasmrelaxedsimd_fma, 4, false, 1, 8, 1, 4, 1, false, float, struct xnn_f32_default_params, ((struct 
xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_relu_ukernel_3x8__wasmrelaxedsimd_fma_loadsplat, 1, false, 3, 8, 1, 1, 3, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_relu_ukernel_3x8__wasmrelaxedsimd_fma_splat, 4, false, 3, 8, 1, 1, 3, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_relu_ukernel_3x8s4__wasmrelaxedsimd_fma, 4, false, 3, 8, 1, 4, 3, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_relu_ukernel_4x2c4__wasmrelaxedsimd_fma, 4, false, 4, 2, 4, 1, 4, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_relu_ukernel_4x8__wasmrelaxedsimd_fma_loadsplat, 1, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_relu_ukernel_4x8__wasmrelaxedsimd_fma_splat, 4, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_relu_ukernel_4x8s4__wasmrelaxedsimd_fma, 4, false, 4, 8, 1, 4, 4, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_relu_ukernel_5x8__wasmrelaxedsimd_fma_loadsplat, 1, false, 5, 8, 1, 1, 5, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_relu_ukernel_5x8__wasmrelaxedsimd_fma_splat, 4, false, 5, 8, 1, 1, 5, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_gemm_goi_w) 
+XNN_GEMM(0, xnn_f32_gemm_relu_ukernel_5x8s4__wasmrelaxedsimd_fma, 4, false, 5, 8, 1, 4, 5, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_relu_ukernel_6x8__wasmrelaxedsimd_fma_loadsplat, 1, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_relu_ukernel_6x8__wasmrelaxedsimd_fma_splat, 4, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_relu_ukernel_6x8s4__wasmrelaxedsimd_fma, 4, false, 6, 8, 1, 4, 6, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_gemm_goi_w) #endif // XNN_ARCH_WASMRELAXEDSIMD #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -XNN_GEMM(XNN_ARCH_WASM, xnn_f32_gemm_relu_ukernel_1x4__wasm, 1, false, 1, 4, 1, 1, false, float, struct xnn_f32_minmax_params , NULL, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASM, xnn_f32_gemm_relu_ukernel_2x4__wasm, 1, false, 2, 4, 1, 1, false, float, struct xnn_f32_minmax_params , NULL, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASM, xnn_f32_gemm_relu_ukernel_4x2__wasm, 1, false, 4, 2, 1, 1, false, float, struct xnn_f32_minmax_params , NULL, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_WASM, xnn_f32_gemm_relu_ukernel_4x4__wasm, 1, false, 4, 4, , 1, false, float, struct xnn_f32_minmax_params , NULL, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_relu_ukernel_1x4__wasm, 1, false, 1, 4, 1, 1, 1, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_relu_ukernel_2x4__wasm, 1, false, 2, 4, 1, 1, 2, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, 
xnn_f32_gemm_relu_ukernel_4x2__wasm, 1, false, 4, 2, 1, 1, 4, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_relu_ukernel_4x4__wasm, 1, false, 4, 4, 1, 1, 4, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_gemm_goi_w) #endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -#if XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV -XNN_GEMM(XNN_ARCH_RISCV, xnn_f32_gemm_relu_ukernel_1x4v__rvv, 1, false, 1, 4, 1, 1, false, float, struct xnn_f32_minmax_params , NULL, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(XNN_ARCH_RISCV, xnn_f32_gemm_relu_ukernel_7x4v__rvv, 1, false, 7, 4, 1, 1, false, float, struct xnn_f32_minmax_params , NULL, xnn_pack_f32_gemm_goi_w) -#endif // XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV +XNN_GEMM(0, xnn_f32_gemm_relu_ukernel_1x4__scalar, 1, false, 1, 4, 1, 1, 1, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_relu_ukernel_2x4__scalar, 1, false, 2, 4, 1, 1, 2, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_relu_ukernel_4x2__scalar, 1, false, 4, 2, 1, 1, 4, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_gemm_relu_ukernel_4x4__scalar, 1, false, 4, 4, 1, 1, 4, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_gemm_goi_w) + +#if XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) +XNN_GEMM(xnn_arch_riscv_vector, xnn_f32_gemm_relu_ukernel_1x4v__rvv, 1, false, 1, 4, 1, 1, 1, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_riscv_vector, xnn_f32_gemm_relu_ukernel_7x4v__rvv, 1, false, 7, 4, 
1, 1, 7, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_gemm_goi_w) +#endif // XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) -// SCALAR -XNN_GEMM(0, xnn_f32_gemm_relu_ukernel_1x4__scalar, 1, false, 1, 4, 1, 1, false, float, struct xnn_f32_minmax_params , NULL, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(0, xnn_f32_gemm_relu_ukernel_4x2__scalar, 1, false, 4, 2, 1, 1, false, float, struct xnn_f32_minmax_params , NULL, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(0, xnn_f32_gemm_relu_ukernel_4x4__scalar, 1, false, 4, 4, 1, 1, false, float, struct xnn_f32_minmax_params , NULL, xnn_pack_f32_gemm_goi_w) -XNN_GEMM(0, xnn_f32_gemm_relu_ukernel_2x4__scalar, 1, false, 2, 4, 1, 1, false, float, struct xnn_f32_minmax_params , NULL, xnn_pack_f32_gemm_goi_w) diff --git a/src/f32-igemm/f32-igemm-minmax.h b/src/f32-igemm/f32-igemm-minmax.h index e69de29bb2d..c4235995564 100644 --- a/src/f32-igemm/f32-igemm-minmax.h +++ b/src/f32-igemm/f32-igemm-minmax.h @@ -0,0 +1,303 @@ +// Copyright 2023 Google LLC + // + // This source code is licensed under the BSD-style license found in the + // LICENSE file in the root directory of this source tree. 
+ // Arguments are: + // XNN_GEMM(arch_flags, fn_name, k_block, is_pipelined, mr, nr, kr, sr, mr_packed, is_igemm, datatype, params_type, init_fn, pack_fn) + + +#if XNN_ARCH_ARM +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch32_neon_cortex_a53, 2, false, 1, 8, 1, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch32_neon_cortex_a53_prfm, 2, false, 1, 8, 1, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a7, 2, false, 4, 8, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a53, 4, false, 4, 8, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a53_prfm, 4, false, 4, 8, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a55, 4, false, 4, 8, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a75, 4, false, 4, 8, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch32_neon_cortex_a75_prfm, 4, false, 4, 8, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon, 
xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch32_neon_ld64, 2, false, 4, 8, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +#endif // XNN_ARCH_ARM + +#if XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a53, 8, false, 1, 8, 1, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a53_prfm, 8, false, 1, 8, 1, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a75, 8, false, 1, 8, 1, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_cortex_a75_prfm, 8, false, 1, 8, 1, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld64, 2, false, 1, 8, 1, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld64_prfm, 2, false, 1, 8, 1, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_1x12__asm_aarch64_neonfma_cortex_a53, 4, false, 1, 12, 1, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_4x2__asm_aarch64_neonfma_cortex_a75, 8, false, 4, 2, 1, 1, 4, true, float, struct 
xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_4x2__asm_aarch64_neonfma_cortex_a75_prfm, 8, false, 4, 2, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_4x2__asm_aarch64_neonfma_ld64, 2, false, 4, 2, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a53, 4, false, 4, 8, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a53_prfm, 4, false, 4, 8, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a55, 4, false, 4, 8, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a75, 8, false, 4, 8, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a75_prfm, 8, false, 4, 8, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch64_neonfma_ld64, 2, false, 4, 8, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fma, 
xnn_f32_igemm_minmax_ukernel_4x8__asm_aarch64_neonfma_ld128, 4, false, 4, 8, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_4x12__asm_aarch64_neonfma_cortex_a53, 4, false, 4, 12, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_5x8__asm_aarch64_neonfma_cortex_a75, 8, false, 5, 8, 1, 1, 5, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_5x8__asm_aarch64_neonfma_cortex_a75_prfm, 8, false, 5, 8, 1, 1, 5, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a53, 4, false, 6, 8, 1, 1, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a53_prfm, 4, false, 6, 8, 1, 1, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a55, 4, false, 6, 8, 1, 1, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a73, 8, false, 6, 8, 1, 1, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a75, 8, false, 6, 8, 1, 1, 6, true, float, struct xnn_f32_default_params, 
xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_6x8__asm_aarch64_neonfma_cortex_a75_prfm, 8, false, 6, 8, 1, 1, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_6x8__asm_aarch64_neonfma_ld64, 2, false, 6, 8, 1, 1, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_6x8__asm_aarch64_neonfma_ld128, 4, false, 6, 8, 1, 1, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +#endif // XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_igemm_minmax_ukernel_1x16__neon_lane_ld128, 4, false, 1, 16, 1, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_igemm_minmax_ukernel_2x16__neon_lane_ld128, 4, false, 2, 16, 1, 1, 2, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_igemm_minmax_ukernel_3x16__neon_lane_ld128, 4, false, 3, 16, 1, 1, 3, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_igemm_minmax_ukernel_4x16__neon_lane_ld128, 4, false, 4, 16, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_igemm_minmax_ukernel_5x16__neon_lane_ld128, 4, false, 5, 16, 1, 1, 5, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_igemm_minmax_ukernel_6x16__neon_lane_ld128, 4, false, 6, 16, 1, 1, 6, true, float, struct 
xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_1x16__aarch64_neonfma_lane_ld128, 4, false, 1, 16, 1, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_2x16__aarch64_neonfma_lane_ld128, 4, false, 2, 16, 1, 1, 2, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_3x16__aarch64_neonfma_lane_ld128, 4, false, 3, 16, 1, 1, 3, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_4x16__aarch64_neonfma_lane_ld128, 4, false, 4, 16, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_5x16__aarch64_neonfma_lane_ld128, 4, false, 5, 16, 1, 1, 5, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_6x16__aarch64_neonfma_lane_ld128, 4, false, 6, 16, 1, 1, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +#endif // XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld128, 4, false, 1, 8, 1, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_igemm_minmax_ukernel_4x8__neon_lane_ld128, 4, false, 4, 8, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) 
+XNN_GEMM(xnn_arch_arm_neon, xnn_f32_igemm_minmax_ukernel_6x8__neon_lane_ld128, 4, false, 6, 8, 1, 1, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_lane_ld128, 4, false, 1, 8, 1, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_lane_ld128, 4, false, 4, 8, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_lane_ld128, 4, false, 6, 8, 1, 1, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_lane_ld64, 2, false, 1, 8, 1, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +#endif // XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_igemm_minmax_ukernel_1x8__neon_dup_ld64, 2, false, 1, 8, 1, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64, 2, false, 1, 8, 1, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_1x8__neonfma_dup_ld64, 2, false, 1, 8, 1, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_igemm_minmax_ukernel_1x8s4__neon, 4, false, 1, 8, 1, 4, 1, true, float, struct 
xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_1x8s4__neonfma, 4, false, 1, 8, 1, 4, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_4x2__aarch64_neonfma_lane_ld64, 2, false, 4, 2, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +#endif // XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_igemm_minmax_ukernel_4x2__neon_lane_ld64, 2, false, 4, 2, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_4x4__aarch64_neonfma_lane_ld64, 2, false, 4, 4, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +#endif // XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_igemm_minmax_ukernel_4x4__neon_lane_ld64, 2, false, 4, 4, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_lane_ld64, 2, false, 4, 8, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +#endif // XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_igemm_minmax_ukernel_4x8__neon_dup_ld64, 2, false, 4, 8, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) 
+XNN_GEMM(xnn_arch_arm_neon, xnn_f32_igemm_minmax_ukernel_4x8__neon_dup_ld128, 4, false, 4, 8, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_igemm_minmax_ukernel_4x8__neon_lane_ld64, 2, false, 4, 8, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_4x8__neonfma_dup_ld64, 2, false, 4, 8, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_4x8__neonfma_dup_ld128, 4, false, 4, 8, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_igemm_minmax_ukernel_4x8s4__neon, 4, false, 4, 8, 1, 4, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_4x8s4__neonfma, 4, false, 4, 8, 1, 4, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_6x2__aarch64_neonfma_lane_ld64, 2, false, 6, 2, 1, 1, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +#endif // XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_igemm_minmax_ukernel_6x2__neon_lane_ld64, 2, false, 6, 2, 1, 1, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_lane_ld64, 2, false, 6, 8, 1, 1, 
6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +#endif // XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_igemm_minmax_ukernel_6x8__neon_dup_ld64, 2, false, 6, 8, 1, 1, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_igemm_minmax_ukernel_6x8__neon_dup_ld128, 4, false, 6, 8, 1, 1, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_igemm_minmax_ukernel_6x8__neon_lane_ld64, 2, false, 6, 8, 1, 1, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_6x8__neonfma_dup_ld64, 2, false, 6, 8, 1, 1, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_6x8__neonfma_dup_ld128, 4, false, 6, 8, 1, 1, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_igemm_minmax_ukernel_6x8s4__neon, 4, false, 6, 8, 1, 4, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_6x8s4__neonfma, 4, false, 6, 8, 1, 4, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_igemm_minmax_ukernel_8x8s4__neon, 4, false, 8, 8, 1, 4, 8, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_igemm_minmax_ukernel_8x8s4__neonfma, 4, false, 8, 8, 1, 4, 8, true, float, struct xnn_f32_default_params, 
xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_1x8__sse_dup, 4, false, 1, 8, 1, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_1x8__sse_load1, 1, false, 1, 8, 1, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_1x8s4__sse, 4, false, 1, 8, 1, 4, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_3x8__sse_dup, 4, false, 3, 8, 1, 1, 3, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_3x8__sse_load1, 1, false, 3, 8, 1, 1, 3, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_3x8s4__sse, 4, false, 3, 8, 1, 4, 3, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_4x2c4__sse, 4, false, 4, 2, 4, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_4x8__sse_dup, 4, false, 4, 8, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_4x8__sse_load1, 1, false, 4, 8, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_4x8s4__sse, 4, false, 4, 8, 1, 4, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) 
+XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_5x8__sse_dup, 4, false, 5, 8, 1, 1, 5, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_5x8__sse_load1, 1, false, 5, 8, 1, 1, 5, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_5x8s4__sse, 4, false, 5, 8, 1, 4, 5, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_6x2c4__sse, 4, false, 6, 2, 4, 1, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_6x8__sse_dup, 4, false, 6, 8, 1, 1, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_6x8__sse_load1, 1, false, 6, 8, 1, 1, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_6x8s4__sse, 4, false, 6, 8, 1, 4, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_f32_igemm_minmax_ukernel_1x8__avx_broadcast, 1, false, 1, 8, 1, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_f32_igemm_minmax_ukernel_1x16__avx_broadcast, 1, false, 1, 16, 1, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_f32_igemm_minmax_ukernel_3x16__avx_broadcast, 1, false, 3, 16, 1, 1, 3, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx, 
xnn_f32_igemm_minmax_ukernel_4x8__avx_broadcast, 1, false, 4, 8, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_f32_igemm_minmax_ukernel_4x16__avx_broadcast, 1, false, 4, 16, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_f32_igemm_minmax_ukernel_5x8__avx_broadcast, 1, false, 5, 8, 1, 1, 5, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_f32_igemm_minmax_ukernel_5x16__avx_broadcast, 1, false, 5, 16, 1, 1, 5, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_f32_igemm_minmax_ukernel_6x8__avx_broadcast, 1, false, 6, 8, 1, 1, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_f32_igemm_minmax_ukernel_6x16__avx_broadcast, 1, false, 6, 16, 1, 1, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_f32_igemm_minmax_ukernel_7x8__avx_broadcast, 1, false, 7, 8, 1, 1, 7, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_igemm_minmax_ukernel_1x8__fma3_broadcast, 1, false, 1, 8, 1, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_igemm_minmax_ukernel_1x16__fma3_broadcast, 1, false, 1, 16, 1, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast, 4, false, 1, 16, 1, 4, 1, true, float, struct xnn_f32_default_params, 
xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_igemm_minmax_ukernel_3x16__fma3_broadcast, 1, false, 3, 16, 1, 1, 3, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast, 4, false, 3, 16, 1, 4, 3, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_igemm_minmax_ukernel_4x8__fma3_broadcast, 1, false, 4, 8, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_igemm_minmax_ukernel_4x16__fma3_broadcast, 1, false, 4, 16, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast, 4, false, 4, 16, 1, 4, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_igemm_minmax_ukernel_5x8__fma3_broadcast, 1, false, 5, 8, 1, 1, 5, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_igemm_minmax_ukernel_5x16__fma3_broadcast, 1, false, 5, 16, 1, 1, 5, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast, 4, false, 5, 16, 1, 4, 5, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_igemm_minmax_ukernel_6x8__fma3_broadcast, 1, false, 6, 8, 1, 1, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_x86_fma3, 
xnn_f32_igemm_minmax_ukernel_6x16__fma3_broadcast, 1, false, 6, 16, 1, 1, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_igemm_minmax_ukernel_6x16s4__fma3_broadcast, 4, false, 6, 16, 1, 4, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_igemm_minmax_ukernel_7x8__fma3_broadcast, 1, false, 7, 8, 1, 1, 7, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_igemm_minmax_ukernel_8x8__fma3_broadcast, 1, false, 8, 8, 1, 1, 8, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_igemm_minmax_ukernel_1x16__avx512f_broadcast, 1, false, 1, 16, 1, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_igemm_minmax_ukernel_4x16__avx512f_broadcast, 1, false, 4, 16, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_igemm_minmax_ukernel_5x16__avx512f_broadcast, 1, false, 5, 16, 1, 1, 5, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_igemm_minmax_ukernel_6x16__avx512f_broadcast, 1, false, 6, 16, 1, 1, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_igemm_minmax_ukernel_7x16__avx512f_broadcast, 1, false, 7, 16, 1, 1, 7, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, 
xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_igemm_minmax_ukernel_8x16__avx512f_broadcast, 1, false, 8, 16, 1, 1, 8, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_igemm_minmax_ukernel_1x32__avx512f_broadcast, 1, false, 1, 32, 1, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_igemm_minmax_ukernel_4x32__avx512f_broadcast, 1, false, 4, 32, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_igemm_minmax_ukernel_5x32__avx512f_broadcast, 1, false, 5, 32, 1, 1, 5, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_igemm_minmax_ukernel_6x32__avx512f_broadcast, 1, false, 6, 32, 1, 1, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_igemm_minmax_ukernel_7x32__avx512f_broadcast, 1, false, 7, 32, 1, 1, 7, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512f, xnn_f32_igemm_minmax_ukernel_8x32__avx512f_broadcast, 1, false, 8, 32, 1, 1, 8, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +#endif // XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + +#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_loadsplat, 1, false, 1, 8, 1, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_splat, 4, false, 1, 8, 1, 1, 1, true, float, struct 
xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat, 1, false, 1, 8, 1, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_splat, 4, false, 1, 8, 1, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_arm, 4, false, 1, 8, 1, 4, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_x86, 4, false, 1, 8, 1, 4, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_arm_loadsplat, 1, false, 3, 8, 1, 1, 3, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_arm_splat, 4, false, 3, 8, 1, 1, 3, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat, 1, false, 3, 8, 1, 1, 3, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_x86_splat, 4, false, 3, 8, 1, 1, 3, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_arm, 4, false, 3, 8, 1, 4, 3, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_x86, 4, false, 3, 8, 1, 4, 3, true, float, struct xnn_f32_default_params, 
xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_arm, 4, false, 4, 2, 4, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_x86, 4, false, 4, 2, 4, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_arm_loadsplat, 1, false, 4, 8, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_arm_splat, 4, false, 4, 8, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat, 1, false, 4, 8, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_splat, 4, false, 4, 8, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_arm, 4, false, 4, 8, 1, 4, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_x86, 4, false, 4, 8, 1, 4, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_arm_loadsplat, 1, false, 5, 8, 1, 1, 5, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_arm_splat, 4, false, 5, 8, 1, 1, 5, true, float, struct xnn_f32_default_params, 
xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_x86_loadsplat, 1, false, 5, 8, 1, 1, 5, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_x86_splat, 4, false, 5, 8, 1, 1, 5, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_arm, 4, false, 5, 8, 1, 4, 5, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_x86, 4, false, 5, 8, 1, 4, 5, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_arm_loadsplat, 1, false, 6, 8, 1, 1, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_arm_splat, 4, false, 6, 8, 1, 1, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_x86_loadsplat, 1, false, 6, 8, 1, 1, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_x86_splat, 4, false, 6, 8, 1, 1, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_arm, 4, false, 6, 8, 1, 4, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_x86, 4, false, 6, 8, 1, 4, 6, true, float, struct xnn_f32_default_params, 
xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + +#if XNN_ARCH_WASMRELAXEDSIMD +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat, 1, false, 1, 8, 1, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_splat, 4, false, 1, 8, 1, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_loadsplat, 1, false, 1, 8, 1, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_splat, 4, false, 1, 8, 1, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_1x8s4__wasmrelaxedsimd, 4, false, 1, 8, 1, 4, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_1x8s4__wasmrelaxedsimd_fma, 4, false, 1, 8, 1, 4, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_3x8__wasmrelaxedsimd_fma_loadsplat, 1, false, 3, 8, 1, 1, 3, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_3x8__wasmrelaxedsimd_fma_splat, 4, false, 3, 8, 1, 1, 3, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_3x8__wasmrelaxedsimd_loadsplat, 1, false, 3, 8, 1, 1, 3, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, 
xnn_f32_igemm_minmax_ukernel_3x8__wasmrelaxedsimd_splat, 4, false, 3, 8, 1, 1, 3, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_3x8s4__wasmrelaxedsimd, 4, false, 3, 8, 1, 4, 3, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_3x8s4__wasmrelaxedsimd_fma, 4, false, 3, 8, 1, 4, 3, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_4x2c4__wasmrelaxedsimd, 4, false, 4, 2, 4, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_4x2c4__wasmrelaxedsimd_fma, 4, false, 4, 2, 4, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_4x8__wasmrelaxedsimd_fma_loadsplat, 1, false, 4, 8, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_4x8__wasmrelaxedsimd_fma_splat, 4, false, 4, 8, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_4x8__wasmrelaxedsimd_loadsplat, 1, false, 4, 8, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_4x8__wasmrelaxedsimd_splat, 4, false, 4, 8, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_4x8s4__wasmrelaxedsimd, 4, false, 4, 8, 1, 4, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) 
+XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_4x8s4__wasmrelaxedsimd_fma, 4, false, 4, 8, 1, 4, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_5x8__wasmrelaxedsimd_fma_loadsplat, 1, false, 5, 8, 1, 1, 5, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_5x8__wasmrelaxedsimd_fma_splat, 4, false, 5, 8, 1, 1, 5, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_5x8__wasmrelaxedsimd_loadsplat, 1, false, 5, 8, 1, 1, 5, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_5x8__wasmrelaxedsimd_splat, 4, false, 5, 8, 1, 1, 5, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_5x8s4__wasmrelaxedsimd, 4, false, 5, 8, 1, 4, 5, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_5x8s4__wasmrelaxedsimd_fma, 4, false, 5, 8, 1, 4, 5, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_6x8__wasmrelaxedsimd_fma_loadsplat, 1, false, 6, 8, 1, 1, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_6x8__wasmrelaxedsimd_fma_splat, 4, false, 6, 8, 1, 1, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_6x8__wasmrelaxedsimd_loadsplat, 1, false, 6, 8, 1, 1, 6, true, float, struct xnn_f32_default_params, 
xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_6x8__wasmrelaxedsimd_splat, 4, false, 6, 8, 1, 1, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_6x8s4__wasmrelaxedsimd, 4, false, 6, 8, 1, 4, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_6x8s4__wasmrelaxedsimd_fma, 4, false, 6, 8, 1, 4, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +#endif // XNN_ARCH_WASMRELAXEDSIMD + +#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_1x4__wasm, 1, false, 1, 4, 1, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_2x4__wasm, 1, false, 2, 4, 1, 1, 2, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_4x2__wasm, 1, false, 4, 2, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_4x4__wasm, 1, false, 4, 4, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_1x4__scalar, 1, false, 1, 4, 1, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_2x4__scalar, 1, false, 2, 4, 1, 1, 2, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_4x2__scalar, 1, 
false, 4, 2, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_minmax_ukernel_4x4__scalar, 1, false, 4, 4, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) + +#if XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) +XNN_GEMM(xnn_arch_riscv_vector, xnn_f32_igemm_minmax_ukernel_1x4v__rvv, 1, false, 1, 4, 1, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_riscv_vector, xnn_f32_igemm_minmax_ukernel_7x4v__rvv, 1, false, 7, 4, 1, 1, 7, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +#endif // XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) + +#if XNN_ENABLE_HVX && (XNN_ARCH_HEXAGON) +XNN_GEMM(xnn_arch_hvx, xnn_f32_igemm_minmax_ukernel_1x32__hvx_broadcast, 1, false, 1, 32, 1, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_hvx, xnn_f32_igemm_minmax_ukernel_1x64__hvx_broadcast, 1, false, 1, 64, 1, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_hvx, xnn_f32_igemm_minmax_ukernel_1x128__hvx_broadcast, 1, false, 1, 128, 1, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_hvx, xnn_f32_igemm_minmax_ukernel_2x128__hvx_broadcast, 1, false, 2, 128, 1, 1, 2, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_hvx, xnn_f32_igemm_minmax_ukernel_4x64__hvx_broadcast, 1, false, 4, 64, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_hvx, xnn_f32_igemm_minmax_ukernel_7x64__hvx_broadcast, 1, false, 7, 64, 1, 1, 7, 
true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_hvx, xnn_f32_igemm_minmax_ukernel_8x32__hvx_broadcast, 1, false, 8, 32, 1, 1, 8, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +XNN_GEMM(xnn_arch_hvx, xnn_f32_igemm_minmax_ukernel_16x32__hvx_broadcast, 1, false, 16, 32, 1, 1, 16, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_conv_goki_w) +#endif // XNN_ENABLE_HVX && (XNN_ARCH_HEXAGON) + diff --git a/src/f32-igemm/f32-igemm-relu.h b/src/f32-igemm/f32-igemm-relu.h index e69de29bb2d..197e7308b97 100644 --- a/src/f32-igemm/f32-igemm-relu.h +++ b/src/f32-igemm/f32-igemm-relu.h @@ -0,0 +1,63 @@ +// Copyright 2023 Google LLC + // + // This source code is licensed under the BSD-style license found in the + // LICENSE file in the root directory of this source tree. + // Arguments are: + // XNN_GEMM(arch_flags, fn_name, k_block, is_pipelined, mr, nr, kr, sr, mr_packed, is_igemm, datatype, params_type, init_fn, pack_fn) + + +#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +XNN_GEMM(0, xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_loadsplat, 1, false, 1, 8, 1, 1, 1, true, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat, 4, false, 1, 8, 1, 1, 1, true, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_relu_ukernel_1x8s4__wasmsimd, 4, false, 1, 8, 1, 4, 1, true, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_relu_ukernel_3x8__wasmsimd_loadsplat, 1, false, 3, 8, 1, 1, 3, true, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, 
xnn_f32_igemm_relu_ukernel_3x8__wasmsimd_splat, 4, false, 3, 8, 1, 1, 3, true, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_relu_ukernel_3x8s4__wasmsimd, 4, false, 3, 8, 1, 4, 3, true, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd, 4, false, 4, 2, 4, 1, 4, true, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_relu_ukernel_4x8__wasmsimd_loadsplat, 1, false, 4, 8, 1, 1, 4, true, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_relu_ukernel_4x8__wasmsimd_splat, 4, false, 4, 8, 1, 1, 4, true, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_relu_ukernel_4x8s4__wasmsimd, 4, false, 4, 8, 1, 4, 4, true, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_relu_ukernel_5x8__wasmsimd_loadsplat, 1, false, 5, 8, 1, 1, 5, true, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_relu_ukernel_5x8__wasmsimd_splat, 4, false, 5, 8, 1, 1, 5, true, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_relu_ukernel_5x8s4__wasmsimd, 4, false, 5, 8, 1, 4, 5, true, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_relu_ukernel_6x8__wasmsimd_loadsplat, 1, false, 6, 8, 1, 1, 6, true, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), 
xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_relu_ukernel_6x8__wasmsimd_splat, 4, false, 6, 8, 1, 1, 6, true, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_relu_ukernel_6x8s4__wasmsimd, 4, false, 6, 8, 1, 4, 6, true, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_conv_goki_w) +#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + +#if XNN_ARCH_WASMRELAXEDSIMD +XNN_GEMM(0, xnn_f32_igemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat, 1, false, 1, 8, 1, 1, 1, true, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_splat, 4, false, 1, 8, 1, 1, 1, true, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_relu_ukernel_1x8s4__wasmrelaxedsimd_fma, 4, false, 1, 8, 1, 4, 1, true, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_relu_ukernel_3x8__wasmrelaxedsimd_fma_loadsplat, 1, false, 3, 8, 1, 1, 3, true, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_relu_ukernel_3x8__wasmrelaxedsimd_fma_splat, 4, false, 3, 8, 1, 1, 3, true, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_relu_ukernel_3x8s4__wasmrelaxedsimd_fma, 4, false, 3, 8, 1, 4, 3, true, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_relu_ukernel_4x2c4__wasmrelaxedsimd_fma, 4, false, 4, 2, 4, 1, 4, true, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), 
xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_relu_ukernel_4x8__wasmrelaxedsimd_fma_loadsplat, 1, false, 4, 8, 1, 1, 4, true, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_relu_ukernel_4x8__wasmrelaxedsimd_fma_splat, 4, false, 4, 8, 1, 1, 4, true, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_relu_ukernel_4x8s4__wasmrelaxedsimd_fma, 4, false, 4, 8, 1, 4, 4, true, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_relu_ukernel_5x8__wasmrelaxedsimd_fma_loadsplat, 1, false, 5, 8, 1, 1, 5, true, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_relu_ukernel_5x8__wasmrelaxedsimd_fma_splat, 4, false, 5, 8, 1, 1, 5, true, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_relu_ukernel_5x8s4__wasmrelaxedsimd_fma, 4, false, 5, 8, 1, 4, 5, true, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_relu_ukernel_6x8__wasmrelaxedsimd_fma_loadsplat, 1, false, 6, 8, 1, 1, 6, true, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_relu_ukernel_6x8__wasmrelaxedsimd_fma_splat, 4, false, 6, 8, 1, 1, 6, true, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_relu_ukernel_6x8s4__wasmrelaxedsimd_fma, 4, false, 6, 8, 1, 4, 6, true, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_conv_goki_w) +#endif // XNN_ARCH_WASMRELAXEDSIMD + 
+#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +XNN_GEMM(0, xnn_f32_igemm_relu_ukernel_1x4__wasm, 1, false, 1, 4, 1, 1, 1, true, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_relu_ukernel_2x4__wasm, 1, false, 2, 4, 1, 1, 2, true, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_relu_ukernel_4x2__wasm, 1, false, 4, 2, 1, 1, 4, true, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_relu_ukernel_4x4__wasm, 1, false, 4, 4, 1, 1, 4, true, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_conv_goki_w) +#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + +XNN_GEMM(0, xnn_f32_igemm_relu_ukernel_1x4__scalar, 1, false, 1, 4, 1, 1, 1, true, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_relu_ukernel_2x4__scalar, 1, false, 2, 4, 1, 1, 2, true, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_relu_ukernel_4x2__scalar, 1, false, 4, 2, 1, 1, 4, true, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_conv_goki_w) +XNN_GEMM(0, xnn_f32_igemm_relu_ukernel_4x4__scalar, 1, false, 4, 4, 1, 1, 4, true, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_conv_goki_w) + +#if XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) +XNN_GEMM(xnn_arch_riscv_vector, xnn_f32_igemm_relu_ukernel_1x4v__rvv, 1, false, 1, 4, 1, 1, 1, true, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_conv_goki_w) 
+XNN_GEMM(xnn_arch_riscv_vector, xnn_f32_igemm_relu_ukernel_7x4v__rvv, 1, false, 7, 4, 1, 1, 7, true, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_conv_goki_w) +#endif // XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) + diff --git a/src/f32-ppmm/f32-ppmm-minmax.h b/src/f32-ppmm/f32-ppmm-minmax.h index e69de29bb2d..d6b38817df7 100644 --- a/src/f32-ppmm/f32-ppmm-minmax.h +++ b/src/f32-ppmm/f32-ppmm-minmax.h @@ -0,0 +1,59 @@ +// Copyright 2023 Google LLC + // + // This source code is licensed under the BSD-style license found in the + // LICENSE file in the root directory of this source tree. + // Arguments are: + // XNN_GEMM(arch_flags, fn_name, k_block, is_pipelined, mr, nr, kr, sr, mr_packed, is_igemm, datatype, params_type, init_fn, pack_fn) + + +#if XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_ppmm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a75, 4, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_ppmm_minmax_ukernel_4x8__asm_aarch64_neonfma_cortex_a75_prfm, 4, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_ppmm_minmax_ukernel_4x8__asm_aarch64_neonfma_ld128, 4, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_ppmm_minmax_ukernel_4x8__asm_aarch64_neonfma_ld128_prfm, 4, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_ppmm_minmax_ukernel_8x8__asm_aarch64_neonfma_cortex_a75, 1, false, 8, 8, 1, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) 
+XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_ppmm_minmax_ukernel_8x8__asm_aarch64_neonfma_cortex_a75_prfm, 1, false, 8, 8, 1, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_ppmm_minmax_ukernel_8x8__asm_aarch64_neonfma_ld128, 1, false, 8, 8, 1, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_ppmm_minmax_ukernel_8x8__asm_aarch64_neonfma_ld128_prfm, 1, false, 8, 8, 1, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_ppmm_minmax_ukernel_4x8__aarch64_neonfma, 1, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_ppmm_minmax_ukernel_4x8__aarch64_neonfma_prfm, 1, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +#endif // XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_ppmm_minmax_ukernel_4x8__neon, 1, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_ppmm_minmax_ukernel_4x8__neon_prfm, 1, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_ppmm_minmax_ukernel_4x16__aarch64_neonfma, 1, false, 4, 16, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_ppmm_minmax_ukernel_4x16__aarch64_neonfma_prfm, 1, false, 4, 16, 1, 1, 4, false, float, struct 
xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +#endif // XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_ppmm_minmax_ukernel_4x16__neon, 1, false, 4, 16, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_ppmm_minmax_ukernel_4x16__neon_prfm, 1, false, 4, 16, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_ppmm_minmax_ukernel_8x8__aarch64_neonfma, 1, false, 8, 8, 1, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_ppmm_minmax_ukernel_8x8__aarch64_neonfma_prfm, 1, false, 8, 8, 1, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +#endif // XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_ppmm_minmax_ukernel_8x8__neon, 1, false, 8, 8, 1, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_ppmm_minmax_ukernel_8x8__neon_prfm, 1, false, 8, 8, 1, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_GEMM(0, xnn_f32_ppmm_minmax_ukernel_4x8__sse, 1, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +XNN_GEMM(0, xnn_f32_ppmm_minmax_ukernel_4x8__wasmsimd_arm_splat, 1, false, 4, 8, 1, 1, 4, false, float, struct 
xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_ppmm_minmax_ukernel_4x8__wasmsimd_x86_splat, 1, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + +XNN_GEMM(0, xnn_f32_ppmm_minmax_ukernel_2x4__scalar, 1, false, 2, 4, 1, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_ppmm_minmax_ukernel_3x3__scalar, 1, false, 3, 3, 1, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_ppmm_minmax_ukernel_4x2__scalar, 1, false, 4, 2, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) +XNN_GEMM(0, xnn_f32_ppmm_minmax_ukernel_4x4__scalar, 1, false, 4, 4, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_gemm_goi_w) diff --git a/src/f32-qc4w-gemm/f32-qc4w-gemm-minmax.h b/src/f32-qc4w-gemm/f32-qc4w-gemm-minmax.h index e69de29bb2d..ec1e2f5bc9d 100644 --- a/src/f32-qc4w-gemm/f32-qc4w-gemm-minmax.h +++ b/src/f32-qc4w-gemm/f32-qc4w-gemm-minmax.h @@ -0,0 +1,111 @@ +// Copyright 2023 Google LLC + // + // This source code is licensed under the BSD-style license found in the + // LICENSE file in the root directory of this source tree. 
+ // Arguments are: + // XNN_GEMM(arch_flags, fn_name, k_block, is_pipelined, mr, nr, kr, sr, mr_packed, is_igemm, datatype, params_type, init_fn, pack_fn) + + +#if XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc4w_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld64, 2, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc4w_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld64_acc2, 2, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc4w_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld64_acc2_prfm, 2, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc4w_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld64_acc4, 4, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc4w_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld64_acc4_prfm, 4, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc4w_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld64_prfm, 2, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc4w_gemm_minmax_ukernel_4x8__asm_aarch64_neonfma_ld64, 2, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc4w_gemm_minmax_ukernel_6x8__asm_aarch64_neonfma_ld64, 2, false, 6, 8, 1, 1, 6, 
false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc4w_gemm_minmax_ukernel_1x8__aarch64_neonfma_lane_ld64, 2, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc4w_gemm_minmax_ukernel_1x8__aarch64_neonfma_lane_ld128, 4, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +#endif // XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_qc4w_gemm_minmax_ukernel_1x8__neon_dup_ld64, 2, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_qc4w_gemm_minmax_ukernel_1x8__neon_lane_ld64, 2, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc4w_gemm_minmax_ukernel_1x8__neonfma_dup_ld64, 2, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc4w_gemm_minmax_ukernel_4x8__aarch64_neonfma_lane_ld64, 2, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc4w_gemm_minmax_ukernel_4x8__aarch64_neonfma_lane_ld128, 4, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +#endif // XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, 
xnn_f32_qc4w_gemm_minmax_ukernel_4x8__neon_dup_ld64, 2, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_qc4w_gemm_minmax_ukernel_4x8__neon_lane_ld64, 2, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc4w_gemm_minmax_ukernel_4x8__neonfma_dup_ld64, 2, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc4w_gemm_minmax_ukernel_5x8__aarch64_neonfma_lane_ld64, 2, false, 5, 8, 1, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +#endif // XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_qc4w_gemm_minmax_ukernel_5x8__neon_lane_ld64, 2, false, 5, 8, 1, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc4w_gemm_minmax_ukernel_6x8__aarch64_neonfma_lane_ld64, 2, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc4w_gemm_minmax_ukernel_6x8__aarch64_neonfma_lane_ld128, 4, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +#endif // XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_qc4w_gemm_minmax_ukernel_6x8__neon_dup_ld64, 2, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, 
xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_qc4w_gemm_minmax_ukernel_6x8__neon_lane_ld64, 2, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc4w_gemm_minmax_ukernel_6x8__neonfma_dup_ld64, 2, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_f32_qc4w_gemm_minmax_ukernel_1x8__sse41_dup, 4, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_f32_qc4w_gemm_minmax_ukernel_3x8__sse41_dup, 4, false, 3, 8, 1, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_f32_qc4w_gemm_minmax_ukernel_4x8__sse41_dup, 4, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_f32_qc4w_gemm_minmax_ukernel_5x8__sse41_dup, 4, false, 5, 8, 1, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_f32_qc4w_gemm_minmax_ukernel_6x8__sse41_dup, 4, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_f32_qc4w_gemm_minmax_ukernel_1x16__avx_broadcast, 2, false, 1, 16, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx, 
xnn_f32_qc4w_gemm_minmax_ukernel_2x16__avx_broadcast, 2, false, 2, 16, 1, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_f32_qc4w_gemm_minmax_ukernel_3x16__avx_broadcast, 2, false, 3, 16, 1, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_f32_qc4w_gemm_minmax_ukernel_4x16__avx_broadcast, 2, false, 4, 16, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_f32_qc4w_gemm_minmax_ukernel_5x16__avx_broadcast, 2, false, 5, 16, 1, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_f32_qc4w_gemm_minmax_ukernel_6x16__avx_broadcast, 2, false, 6, 16, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_f32_qc4w_gemm_minmax_ukernel_7x16__avx_broadcast, 2, false, 7, 16, 1, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_f32_qc4w_gemm_minmax_ukernel_8x16__avx_broadcast, 2, false, 8, 16, 1, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_qc4w_gemm_minmax_ukernel_1x16__fma3_broadcast, 2, false, 1, 16, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_qc4w_gemm_minmax_ukernel_2x16__fma3_broadcast, 2, false, 2, 16, 1, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) 
+XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_qc4w_gemm_minmax_ukernel_3x16__fma3_broadcast, 2, false, 3, 16, 1, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_qc4w_gemm_minmax_ukernel_4x16__fma3_broadcast, 2, false, 4, 16, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_qc4w_gemm_minmax_ukernel_5x16__fma3_broadcast, 2, false, 5, 16, 1, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_qc4w_gemm_minmax_ukernel_6x16__fma3_broadcast, 2, false, 6, 16, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_qc4w_gemm_minmax_ukernel_7x16__fma3_broadcast, 2, false, 7, 16, 1, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_qc4w_gemm_minmax_ukernel_8x16__fma3_broadcast, 2, false, 8, 16, 1, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f32_qc4w_gemm_minmax_ukernel_1x16__avx2_broadcast, 2, false, 1, 16, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f32_qc4w_gemm_minmax_ukernel_2x16__avx2_broadcast, 2, false, 2, 16, 1, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f32_qc4w_gemm_minmax_ukernel_3x16__avx2_broadcast, 2, false, 3, 16, 1, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, 
xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f32_qc4w_gemm_minmax_ukernel_4x16__avx2_broadcast, 2, false, 4, 16, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f32_qc4w_gemm_minmax_ukernel_5x16__avx2_broadcast, 2, false, 5, 16, 1, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f32_qc4w_gemm_minmax_ukernel_6x16__avx2_broadcast, 2, false, 6, 16, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f32_qc4w_gemm_minmax_ukernel_7x16__avx2_broadcast, 2, false, 7, 16, 1, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f32_qc4w_gemm_minmax_ukernel_8x16__avx2_broadcast, 2, false, 8, 16, 1, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_f32_qc4w_gemm_minmax_ukernel_1x32__avx512skx_broadcast, 2, false, 1, 32, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_f32_qc4w_gemm_minmax_ukernel_2x32__avx512skx_broadcast, 2, false, 2, 32, 1, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_f32_qc4w_gemm_minmax_ukernel_3x32__avx512skx_broadcast, 2, false, 3, 32, 1, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) 
+XNN_GEMM(xnn_arch_x86_avx512skx, xnn_f32_qc4w_gemm_minmax_ukernel_4x32__avx512skx_broadcast, 2, false, 4, 32, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_f32_qc4w_gemm_minmax_ukernel_5x32__avx512skx_broadcast, 2, false, 5, 32, 1, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_f32_qc4w_gemm_minmax_ukernel_6x32__avx512skx_broadcast, 2, false, 6, 32, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_f32_qc4w_gemm_minmax_ukernel_7x32__avx512skx_broadcast, 2, false, 7, 32, 1, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_f32_qc4w_gemm_minmax_ukernel_8x32__avx512skx_broadcast, 2, false, 8, 32, 1, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + +#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +XNN_GEMM(0, xnn_f32_qc4w_gemm_minmax_ukernel_1x4__wasm, 2, false, 1, 4, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc4w_gemm_minmax_ukernel_2x4__wasm, 2, false, 2, 4, 1, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc4w_gemm_minmax_ukernel_4x2__wasm, 2, false, 4, 2, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc4w_gemm_minmax_ukernel_4x4__wasm, 2, false, 4, 4, 1, 1, 4, false, 
float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + +XNN_GEMM(0, xnn_f32_qc4w_gemm_minmax_ukernel_1x4__scalar, 2, false, 1, 4, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc4w_gemm_minmax_ukernel_2x4__scalar, 2, false, 2, 4, 1, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc4w_gemm_minmax_ukernel_4x2__scalar, 2, false, 4, 2, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc4w_gemm_minmax_ukernel_4x4__scalar, 2, false, 4, 4, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_f32_qc4w_gemm_goi_w) diff --git a/src/f32-qc8w-gemm/f32-qc8w-gemm-minmax.h b/src/f32-qc8w-gemm/f32-qc8w-gemm-minmax.h index e69de29bb2d..ceae077a206 100644 --- a/src/f32-qc8w-gemm/f32-qc8w-gemm-minmax.h +++ b/src/f32-qc8w-gemm/f32-qc8w-gemm-minmax.h @@ -0,0 +1,247 @@ +// Copyright 2023 Google LLC + // + // This source code is licensed under the BSD-style license found in the + // LICENSE file in the root directory of this source tree. 
+ // Arguments are: + // XNN_GEMM(arch_flags, fn_name, k_block, is_pipelined, mr, nr, kr, sr, mr_packed, is_igemm, datatype, params_type, init_fn, pack_fn) + + +#if XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_qc8w_gemm_minmax_ukernel_1x8__asm_aarch64_neon_ld128_acc2, 4, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_qc8w_gemm_minmax_ukernel_1x8__asm_aarch64_neon_ld128_acc2_prfm, 4, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc8w_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld64, 2, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc8w_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld64_acc2, 2, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc8w_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld64_acc2_prfm, 2, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc8w_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld64_acc4, 4, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc8w_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld64_acc4_prfm, 4, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc8w_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld64_prfm, 2, false, 1, 8, 1, 1, 1, false, float, struct 
xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc8w_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld128, 4, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc8w_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld128_acc2, 4, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc8w_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld128_acc2_prfm, 4, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc8w_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld128_acc4, 4, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc8w_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld128_acc4_prfm, 4, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc8w_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld128_prfm, 4, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc8w_gemm_minmax_ukernel_4x1__asm_aarch64_neonfma_ld64, 2, false, 4, 1, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc8w_gemm_minmax_ukernel_4x1__asm_aarch64_neonfma_ld128, 4, false, 4, 1, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) 
+XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc8w_gemm_minmax_ukernel_4x2__asm_aarch64_neonfma_ld64, 2, false, 4, 2, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc8w_gemm_minmax_ukernel_4x2__asm_aarch64_neonfma_ld128, 4, false, 4, 2, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc8w_gemm_minmax_ukernel_4x8__asm_aarch64_neonfma_ld64, 2, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc8w_gemm_minmax_ukernel_4x8__asm_aarch64_neonfma_ld128, 4, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc8w_gemm_minmax_ukernel_6x8__asm_aarch64_neonfma_ld64, 2, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc8w_gemm_minmax_ukernel_6x8__asm_aarch64_neonfma_ld128, 4, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc8w_gemm_minmax_ukernel_1x8__aarch64_neonfma_lane_ld64, 2, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc8w_gemm_minmax_ukernel_1x8__aarch64_neonfma_lane_ld128, 4, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +#endif // XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, 
xnn_f32_qc8w_gemm_minmax_ukernel_1x8__neon_dup_ld64, 2, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_qc8w_gemm_minmax_ukernel_1x8__neon_lane_ld64, 2, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc8w_gemm_minmax_ukernel_1x8__neonfma_dup_ld64, 2, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc8w_gemm_minmax_ukernel_1x16__aarch64_neonfma_lane_ld128, 4, false, 1, 16, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc8w_gemm_minmax_ukernel_4x2__aarch64_neonfma_lane_ld64, 2, false, 4, 2, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +#endif // XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_qc8w_gemm_minmax_ukernel_4x2__neon_lane_ld64, 2, false, 4, 2, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc8w_gemm_minmax_ukernel_4x8__aarch64_neonfma_lane_ld64, 2, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc8w_gemm_minmax_ukernel_4x8__aarch64_neonfma_lane_ld128, 4, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +#endif // 
XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_qc8w_gemm_minmax_ukernel_4x8__neon_dup_ld64, 2, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_qc8w_gemm_minmax_ukernel_4x8__neon_lane_ld64, 2, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc8w_gemm_minmax_ukernel_4x8__neonfma_dup_ld64, 2, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc8w_gemm_minmax_ukernel_4x16__aarch64_neonfma_lane_ld128, 4, false, 4, 16, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc8w_gemm_minmax_ukernel_5x8__aarch64_neonfma_lane_ld64, 2, false, 5, 8, 1, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +#endif // XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_qc8w_gemm_minmax_ukernel_5x8__neon_lane_ld64, 2, false, 5, 8, 1, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc8w_gemm_minmax_ukernel_6x2__aarch64_neonfma_lane_ld64, 2, false, 6, 2, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +#endif // XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_qc8w_gemm_minmax_ukernel_6x2__neon_lane_ld64, 2, false, 6, 2, 1, 1, 6, 
false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc8w_gemm_minmax_ukernel_6x8__aarch64_neonfma_lane_ld64, 2, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc8w_gemm_minmax_ukernel_6x8__aarch64_neonfma_lane_ld128, 4, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +#endif // XNN_ARCH_ARM64 + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_qc8w_gemm_minmax_ukernel_6x8__neon_dup_ld64, 2, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_f32_qc8w_gemm_minmax_ukernel_6x8__neon_lane_ld64, 2, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc8w_gemm_minmax_ukernel_6x8__neonfma_dup_ld64, 2, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc8w_gemm_minmax_ukernel_1x8s4__neonfma, 4, false, 1, 8, 1, 4, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc8w_gemm_minmax_ukernel_4x8s4__neonfma, 4, false, 4, 8, 1, 4, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fma, xnn_f32_qc8w_gemm_minmax_ukernel_6x8s4__neonfma, 4, false, 6, 8, 1, 4, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, 
xnn_pack_f32_qs8w_gemm_goi_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_f32_qc8w_gemm_minmax_ukernel_1x8__sse41_dup, 4, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_f32_qc8w_gemm_minmax_ukernel_1x8__sse41_load1, 1, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_f32_qc8w_gemm_minmax_ukernel_1x8s4__sse41, 4, false, 1, 8, 1, 4, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_f32_qc8w_gemm_minmax_ukernel_3x8__sse41_dup, 4, false, 3, 8, 1, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_f32_qc8w_gemm_minmax_ukernel_3x8__sse41_load1, 1, false, 3, 8, 1, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_f32_qc8w_gemm_minmax_ukernel_3x8s4__sse41, 4, false, 3, 8, 1, 4, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_f32_qc8w_gemm_minmax_ukernel_4x2c4__sse41, 4, false, 4, 2, 4, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_f32_qc8w_gemm_minmax_ukernel_4x8__sse41_dup, 4, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_f32_qc8w_gemm_minmax_ukernel_4x8__sse41_load1, 1, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, 
xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_f32_qc8w_gemm_minmax_ukernel_4x8s4__sse41, 4, false, 4, 8, 1, 4, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_f32_qc8w_gemm_minmax_ukernel_5x8__sse41_dup, 4, false, 5, 8, 1, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_f32_qc8w_gemm_minmax_ukernel_5x8__sse41_load1, 1, false, 5, 8, 1, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_f32_qc8w_gemm_minmax_ukernel_5x8s4__sse41, 4, false, 5, 8, 1, 4, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_f32_qc8w_gemm_minmax_ukernel_6x8__sse41_dup, 4, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_f32_qc8w_gemm_minmax_ukernel_6x8__sse41_load1, 1, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_f32_qc8w_gemm_minmax_ukernel_6x8s4__sse41, 4, false, 6, 8, 1, 4, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_f32_qc8w_gemm_minmax_ukernel_1x16__avx_broadcast, 1, false, 1, 16, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_f32_qc8w_gemm_minmax_ukernel_2x16__avx_broadcast, 1, false, 2, 16, 1, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, 
xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_f32_qc8w_gemm_minmax_ukernel_3x16__avx_broadcast, 1, false, 3, 16, 1, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_f32_qc8w_gemm_minmax_ukernel_4x16__avx_broadcast, 1, false, 4, 16, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_f32_qc8w_gemm_minmax_ukernel_5x16__avx_broadcast, 1, false, 5, 16, 1, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_f32_qc8w_gemm_minmax_ukernel_6x16__avx_broadcast, 1, false, 6, 16, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_f32_qc8w_gemm_minmax_ukernel_7x16__avx_broadcast, 1, false, 7, 16, 1, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_f32_qc8w_gemm_minmax_ukernel_8x16__avx_broadcast, 1, false, 8, 16, 1, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_qc8w_gemm_minmax_ukernel_1x16__fma3_broadcast, 1, false, 1, 16, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_qc8w_gemm_minmax_ukernel_2x16__fma3_broadcast, 1, false, 2, 16, 1, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_qc8w_gemm_minmax_ukernel_3x16__fma3_broadcast, 1, false, 3, 16, 1, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) 
+XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_qc8w_gemm_minmax_ukernel_4x16__fma3_broadcast, 1, false, 4, 16, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_qc8w_gemm_minmax_ukernel_5x16__fma3_broadcast, 1, false, 5, 16, 1, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_qc8w_gemm_minmax_ukernel_6x16__fma3_broadcast, 1, false, 6, 16, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_qc8w_gemm_minmax_ukernel_7x16__fma3_broadcast, 1, false, 7, 16, 1, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_fma3, xnn_f32_qc8w_gemm_minmax_ukernel_8x16__fma3_broadcast, 1, false, 8, 16, 1, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f32_qc8w_gemm_minmax_ukernel_1x8__avx2_broadcast, 1, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f32_qc8w_gemm_minmax_ukernel_1x16s4__avx2_broadcast, 4, false, 1, 16, 1, 4, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f32_qc8w_gemm_minmax_ukernel_2x16s4__avx2_broadcast, 4, false, 2, 16, 1, 4, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f32_qc8w_gemm_minmax_ukernel_3x16s4__avx2_broadcast, 4, false, 3, 16, 1, 4, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) 
+XNN_GEMM(xnn_arch_x86_avx2, xnn_f32_qc8w_gemm_minmax_ukernel_4x8__avx2_broadcast, 1, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f32_qc8w_gemm_minmax_ukernel_4x16s4__avx2_broadcast, 4, false, 4, 16, 1, 4, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f32_qc8w_gemm_minmax_ukernel_5x8__avx2_broadcast, 1, false, 5, 8, 1, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f32_qc8w_gemm_minmax_ukernel_5x16s4__avx2_broadcast, 4, false, 5, 16, 1, 4, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f32_qc8w_gemm_minmax_ukernel_6x8__avx2_broadcast, 1, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f32_qc8w_gemm_minmax_ukernel_6x16s4__avx2_broadcast, 4, false, 6, 16, 1, 4, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f32_qc8w_gemm_minmax_ukernel_7x8__avx2_broadcast, 1, false, 7, 8, 1, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f32_qc8w_gemm_minmax_ukernel_8x8__avx2_broadcast, 1, false, 8, 8, 1, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f32_qc8w_gemm_minmax_ukernel_1x16__avx2_broadcast, 1, false, 1, 16, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) 
+XNN_GEMM(xnn_arch_x86_avx2, xnn_f32_qc8w_gemm_minmax_ukernel_2x16__avx2_broadcast, 1, false, 2, 16, 1, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f32_qc8w_gemm_minmax_ukernel_3x16__avx2_broadcast, 1, false, 3, 16, 1, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f32_qc8w_gemm_minmax_ukernel_4x16__avx2_broadcast, 1, false, 4, 16, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f32_qc8w_gemm_minmax_ukernel_5x16__avx2_broadcast, 1, false, 5, 16, 1, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f32_qc8w_gemm_minmax_ukernel_6x16__avx2_broadcast, 1, false, 6, 16, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f32_qc8w_gemm_minmax_ukernel_7x16__avx2_broadcast, 1, false, 7, 16, 1, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_f32_qc8w_gemm_minmax_ukernel_8x16__avx2_broadcast, 1, false, 8, 16, 1, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_f32_qc8w_gemm_minmax_ukernel_1x16__avx512skx_broadcast, 1, false, 1, 16, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_f32_qc8w_gemm_minmax_ukernel_1x32__avx512skx_broadcast, 1, false, 1, 32, 1, 1, 1, 
false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_f32_qc8w_gemm_minmax_ukernel_2x16__avx512skx_broadcast, 1, false, 2, 16, 1, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_f32_qc8w_gemm_minmax_ukernel_2x32__avx512skx_broadcast, 1, false, 2, 32, 1, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_f32_qc8w_gemm_minmax_ukernel_3x16__avx512skx_broadcast, 1, false, 3, 16, 1, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_f32_qc8w_gemm_minmax_ukernel_3x32__avx512skx_broadcast, 1, false, 3, 32, 1, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_f32_qc8w_gemm_minmax_ukernel_4x16__avx512skx_broadcast, 1, false, 4, 16, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_f32_qc8w_gemm_minmax_ukernel_4x32__avx512skx_broadcast, 1, false, 4, 32, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_f32_qc8w_gemm_minmax_ukernel_5x16__avx512skx_broadcast, 1, false, 5, 16, 1, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_f32_qc8w_gemm_minmax_ukernel_5x32__avx512skx_broadcast, 1, false, 5, 32, 1, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, 
xnn_f32_qc8w_gemm_minmax_ukernel_6x16__avx512skx_broadcast, 1, false, 6, 16, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_f32_qc8w_gemm_minmax_ukernel_6x32__avx512skx_broadcast, 1, false, 6, 32, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_f32_qc8w_gemm_minmax_ukernel_7x16__avx512skx_broadcast, 1, false, 7, 16, 1, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_f32_qc8w_gemm_minmax_ukernel_7x32__avx512skx_broadcast, 1, false, 7, 32, 1, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_f32_qc8w_gemm_minmax_ukernel_8x16__avx512skx_broadcast, 1, false, 8, 16, 1, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_f32_qc8w_gemm_minmax_ukernel_8x32__avx512skx_broadcast, 1, false, 8, 32, 1, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + +#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_1x8__wasmsimd_arm_loadsplat, 1, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat, 4, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat, 1, false, 1, 8, 1, 1, 1, false, float, 
struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat, 4, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_1x8s4__wasmsimd_arm, 4, false, 1, 8, 1, 4, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_1x8s4__wasmsimd_x86, 4, false, 1, 8, 1, 4, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_3x8__wasmsimd_arm_loadsplat, 1, false, 3, 8, 1, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_3x8__wasmsimd_arm_splat, 4, false, 3, 8, 1, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat, 1, false, 3, 8, 1, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_3x8__wasmsimd_x86_splat, 4, false, 3, 8, 1, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_3x8s4__wasmsimd_arm, 4, false, 3, 8, 1, 4, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_3x8s4__wasmsimd_x86, 4, false, 3, 8, 1, 4, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, 
xnn_f32_qc8w_gemm_minmax_ukernel_4x2c4__wasmsimd_arm, 4, false, 4, 2, 4, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_4x2c4__wasmsimd_x86, 4, false, 4, 2, 4, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_4x8__wasmsimd_arm_loadsplat, 1, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_4x8__wasmsimd_arm_splat, 4, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat, 1, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_4x8__wasmsimd_x86_splat, 4, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_4x8s4__wasmsimd_arm, 4, false, 4, 8, 1, 4, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_4x8s4__wasmsimd_x86, 4, false, 4, 8, 1, 4, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_5x8__wasmsimd_arm_loadsplat, 1, false, 5, 8, 1, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_5x8__wasmsimd_arm_splat, 4, false, 5, 8, 1, 1, 5, false, float, struct xnn_f32_default_params, 
xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_5x8__wasmsimd_x86_loadsplat, 1, false, 5, 8, 1, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_5x8__wasmsimd_x86_splat, 4, false, 5, 8, 1, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_5x8s4__wasmsimd_arm, 4, false, 5, 8, 1, 4, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_5x8s4__wasmsimd_x86, 4, false, 5, 8, 1, 4, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_6x8__wasmsimd_arm_loadsplat, 1, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_6x8__wasmsimd_arm_splat, 4, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_6x8__wasmsimd_x86_loadsplat, 1, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat, 4, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_6x8s4__wasmsimd_arm, 4, false, 6, 8, 1, 4, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_6x8s4__wasmsimd_x86, 4, false, 6, 
8, 1, 4, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + +#if XNN_ARCH_WASMRELAXEDSIMD +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat, 1, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_splat, 4, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_loadsplat, 1, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_splat, 4, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_1x8s4__wasmrelaxedsimd, 4, false, 1, 8, 1, 4, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_1x8s4__wasmrelaxedsimd_fma, 4, false, 1, 8, 1, 4, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_fma_loadsplat, 1, false, 3, 8, 1, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_fma_splat, 4, false, 3, 8, 1, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_loadsplat, 1, false, 3, 
8, 1, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_splat, 4, false, 3, 8, 1, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_3x8s4__wasmrelaxedsimd, 4, false, 3, 8, 1, 4, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_3x8s4__wasmrelaxedsimd_fma, 4, false, 3, 8, 1, 4, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_4x2c4__wasmrelaxedsimd, 4, false, 4, 2, 4, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_4x2c4__wasmrelaxedsimd_fma, 4, false, 4, 2, 4, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_fma_loadsplat, 1, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_fma_splat, 4, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_loadsplat, 1, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_splat, 4, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, 
xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_4x8s4__wasmrelaxedsimd, 4, false, 4, 8, 1, 4, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_4x8s4__wasmrelaxedsimd_fma, 4, false, 4, 8, 1, 4, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_fma_loadsplat, 1, false, 5, 8, 1, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_fma_splat, 4, false, 5, 8, 1, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_loadsplat, 1, false, 5, 8, 1, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_splat, 4, false, 5, 8, 1, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_5x8s4__wasmrelaxedsimd, 4, false, 5, 8, 1, 4, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_5x8s4__wasmrelaxedsimd_fma, 4, false, 5, 8, 1, 4, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_fma_loadsplat, 1, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, 
xnn_f32_qc8w_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_fma_splat, 4, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_loadsplat, 1, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_splat, 4, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_6x8s4__wasmrelaxedsimd, 4, false, 6, 8, 1, 4, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_6x8s4__wasmrelaxedsimd_fma, 4, false, 6, 8, 1, 4, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +#endif // XNN_ARCH_WASMRELAXEDSIMD + +#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_1x4__wasm, 1, false, 1, 4, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_2x4__wasm, 1, false, 2, 4, 1, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_4x2__wasm, 1, false, 4, 2, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_4x4__wasm, 1, false, 4, 4, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + +XNN_GEMM(0, 
xnn_f32_qc8w_gemm_minmax_ukernel_1x4__scalar, 1, false, 1, 4, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_2x4__scalar, 1, false, 2, 4, 1, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_4x2__scalar, 1, false, 4, 2, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_minmax_ukernel_4x4__scalar, 1, false, 4, 4, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_f32_qs8w_gemm_goi_w) diff --git a/src/f32-qc8w-gemm/f32-qc8w-gemm-relu.h b/src/f32-qc8w-gemm/f32-qc8w-gemm-relu.h index e69de29bb2d..a6fcfdc8681 100644 --- a/src/f32-qc8w-gemm/f32-qc8w-gemm-relu.h +++ b/src/f32-qc8w-gemm/f32-qc8w-gemm-relu.h @@ -0,0 +1,38 @@ +// Copyright 2023 Google LLC + // + // This source code is licensed under the BSD-style license found in the + // LICENSE file in the root directory of this source tree. 
+ // Arguments are: + // XNN_GEMM(arch_flags, fn_name, k_block, is_pipelined, mr, nr, kr, sr, mr_packed, is_igemm, datatype, params_type, init_fn, pack_fn) + + +#if XNN_ARCH_WASMRELAXEDSIMD +XNN_GEMM(0, xnn_f32_qc8w_gemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat, 1, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_splat, 4, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_relu_ukernel_1x8s4__wasmrelaxedsimd_fma, 4, false, 1, 8, 1, 4, 1, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_relu_ukernel_3x8__wasmrelaxedsimd_fma_loadsplat, 1, false, 3, 8, 1, 1, 3, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_relu_ukernel_3x8__wasmrelaxedsimd_fma_splat, 4, false, 3, 8, 1, 1, 3, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_relu_ukernel_3x8s4__wasmrelaxedsimd_fma, 4, false, 3, 8, 1, 4, 3, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_relu_ukernel_4x2c4__wasmrelaxedsimd_fma, 4, false, 4, 2, 4, 1, 4, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_relu_ukernel_4x8__wasmrelaxedsimd_fma_loadsplat, 1, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, 
xnn_f32_qc8w_gemm_relu_ukernel_4x8__wasmrelaxedsimd_fma_splat, 4, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_relu_ukernel_4x8s4__wasmrelaxedsimd_fma, 4, false, 4, 8, 1, 4, 4, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_relu_ukernel_5x8__wasmrelaxedsimd_fma_loadsplat, 1, false, 5, 8, 1, 1, 5, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_relu_ukernel_5x8__wasmrelaxedsimd_fma_splat, 4, false, 5, 8, 1, 1, 5, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_relu_ukernel_5x8s4__wasmrelaxedsimd_fma, 4, false, 5, 8, 1, 4, 5, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_relu_ukernel_6x8__wasmrelaxedsimd_fma_loadsplat, 1, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_relu_ukernel_6x8__wasmrelaxedsimd_fma_splat, 4, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_relu_ukernel_6x8s4__wasmrelaxedsimd_fma, 4, false, 6, 8, 1, 4, 6, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_qs8w_gemm_goi_w) +#endif // XNN_ARCH_WASMRELAXEDSIMD + +#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +XNN_GEMM(0, xnn_f32_qc8w_gemm_relu_ukernel_1x4__wasm, 1, false, 1, 4, 1, 1, 1, false, float, struct 
xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_relu_ukernel_2x4__wasm, 1, false, 2, 4, 1, 1, 2, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_relu_ukernel_4x2__wasm, 1, false, 4, 2, 1, 1, 4, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_relu_ukernel_4x4__wasm, 1, false, 4, 4, 1, 1, 4, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_qs8w_gemm_goi_w) +#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + +XNN_GEMM(0, xnn_f32_qc8w_gemm_relu_ukernel_1x4__scalar, 1, false, 1, 4, 1, 1, 1, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_relu_ukernel_2x4__scalar, 1, false, 2, 4, 1, 1, 2, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_relu_ukernel_4x2__scalar, 1, false, 4, 2, 1, 1, 4, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_qs8w_gemm_goi_w) +XNN_GEMM(0, xnn_f32_qc8w_gemm_relu_ukernel_4x4__scalar, 1, false, 4, 4, 1, 1, 4, false, float, struct xnn_f32_default_params, ((struct xnn_init_f32_default_params_fn) NULL), xnn_pack_f32_qs8w_gemm_goi_w) diff --git a/src/qd8-f16-qb4w-gemm/qd8-f16-qb4w-gemm-minmax.h b/src/qd8-f16-qb4w-gemm/qd8-f16-qb4w-gemm-minmax.h index e69de29bb2d..7702a84b6ab 100644 --- a/src/qd8-f16-qb4w-gemm/qd8-f16-qb4w-gemm-minmax.h +++ b/src/qd8-f16-qb4w-gemm/qd8-f16-qb4w-gemm-minmax.h @@ -0,0 +1,77 @@ +// Copyright 2023 Google LLC + // + // This source code is licensed under the BSD-style license found in the + // 
LICENSE file in the root directory of this source tree. + // Arguments are: + // XNN_GEMM(arch_flags, fn_name, k_block, is_pipelined, mr, nr, kr, sr, mr_packed, is_igemm, datatype, params_type, init_fn, pack_fn) + + XNN_GEMM(0, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_1x2__scalar, 32, false, 1, 2, 1, 1, 1, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_1x4__scalar, 32, false, 1, 4, 1, 1, 1, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_1x8__scalar, 32, false, 1, 8, 1, 1, 1, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_2x2__scalar, 32, false, 2, 2, 1, 1, 2, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_2x4__scalar, 32, false, 2, 4, 1, 1, 2, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_2x8__scalar, 32, false, 2, 8, 1, 1, 2, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_4x4__scalar, 32, false, 4, 4, 1, 1, 4, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) + +#if XNN_ENABLE_ARM_DOTPROD && XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_dot_fp16_arith, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_1x8c4__neondotfp16arith, 32, false, 1, 8, 4, 1, 1, false, xnn_float16, struct xnn_f16_default_params, 
xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot_fp16_arith, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_1x16c4__neondotfp16arith, 32, false, 1, 16, 4, 1, 1, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot_fp16_arith, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_2x8c4__neondotfp16arith, 32, false, 2, 8, 4, 1, 2, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot_fp16_arith, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_2x16c4__neondotfp16arith, 32, false, 2, 16, 4, 1, 2, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot_fp16_arith, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_3x8c4__neondotfp16arith, 32, false, 3, 8, 4, 1, 3, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot_fp16_arith, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_3x16c4__neondotfp16arith, 32, false, 3, 16, 4, 1, 3, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot_fp16_arith, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_4x8c4__neondotfp16arith, 32, false, 4, 8, 4, 1, 4, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot_fp16_arith, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_4x16c4__neondotfp16arith, 32, false, 4, 16, 4, 1, 4, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot_fp16_arith, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_5x8c4__neondotfp16arith, 32, false, 5, 8, 4, 1, 5, false, 
xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot_fp16_arith, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_5x16c4__neondotfp16arith, 32, false, 5, 16, 4, 1, 5, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot_fp16_arith, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_6x8c4__neondotfp16arith, 32, false, 6, 8, 4, 1, 6, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot_fp16_arith, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_6x16c4__neondotfp16arith, 32, false, 6, 16, 4, 1, 6, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +#endif // XNN_ENABLE_ARM_DOTPROD && XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_1x8c8__avx2, 32, false, 1, 8, 8, 1, 1, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_2x8c8__avx2, 32, false, 2, 8, 8, 1, 2, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_3x8c8__avx2, 32, false, 3, 8, 8, 1, 3, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_4x8c8__avx2, 32, false, 4, 8, 8, 1, 4, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_ARM_FP16_VECTOR && 
(XNN_ARCH_ARM || XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_1x16__neonfp16arith_mlal_lane, 32, false, 1, 16, 1, 1, 1, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_2x16__neonfp16arith_mlal_lane, 32, false, 2, 16, 1, 1, 2, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_3x16__neonfp16arith_mlal_lane, 32, false, 3, 16, 1, 1, 3, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_4x16__neonfp16arith_mlal_lane, 32, false, 4, 16, 1, 1, 4, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_6x16__neonfp16arith_mlal_lane, 32, false, 6, 16, 1, 1, 6, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_1x16__neonfp16arith_mlal_lane_prfm, 32, false, 1, 16, 1, 1, 1, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_2x16__neonfp16arith_mlal_lane_prfm, 32, false, 2, 16, 1, 1, 2, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_3x16__neonfp16arith_mlal_lane_prfm, 32, false, 3, 16, 1, 1, 3, false, xnn_float16, 
struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_4x16__neonfp16arith_mlal_lane_prfm, 32, false, 4, 16, 1, 1, 4, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_6x16__neonfp16arith_mlal_lane_prfm, 32, false, 6, 16, 1, 1, 6, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) + +#if XNN_ENABLE_ARM_I8MM && (XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_1x8c8__neoni8mm, 32, false, 1, 8, 8, 1, 1, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_1x16c8__neoni8mm, 32, false, 1, 16, 8, 1, 1, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_1x32c8__neoni8mm, 32, false, 1, 32, 8, 1, 1, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_2x8c8__neoni8mm, 32, false, 2, 8, 8, 1, 2, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_2x16c8__neoni8mm, 32, false, 2, 16, 8, 1, 2, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, 
xnn_qd8_f16_qb4w_gemm_minmax_ukernel_2x32c8__neoni8mm, 32, false, 2, 32, 8, 1, 2, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_3x8c8__neoni8mm, 32, false, 3, 8, 8, 1, 3, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_3x16c8__neoni8mm, 32, false, 3, 16, 8, 1, 3, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_3x32c8__neoni8mm, 32, false, 3, 32, 8, 1, 3, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_4x8c8__neoni8mm, 32, false, 4, 8, 8, 1, 4, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_4x16c8__neoni8mm, 32, false, 4, 16, 8, 1, 4, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_4x32c8__neoni8mm, 32, false, 4, 32, 8, 1, 4, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_5x8c8__neoni8mm, 32, false, 5, 8, 8, 1, 5, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_5x16c8__neoni8mm, 32, false, 5, 16, 8, 1, 5, false, xnn_float16, struct 
xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_5x32c8__neoni8mm, 32, false, 5, 32, 8, 1, 5, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_6x8c8__neoni8mm, 32, false, 6, 8, 8, 1, 6, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_6x16c8__neoni8mm, 32, false, 6, 16, 8, 1, 6, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_6x32c8__neoni8mm, 32, false, 6, 32, 8, 1, 6, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_7x8c8__neoni8mm, 32, false, 7, 8, 8, 1, 7, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_7x16c8__neoni8mm, 32, false, 7, 16, 8, 1, 7, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_7x32c8__neoni8mm, 32, false, 7, 32, 8, 1, 7, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_8x8c8__neoni8mm, 32, false, 8, 8, 8, 1, 8, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) 
+XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_8x16c8__neoni8mm, 32, false, 8, 16, 8, 1, 8, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qb4w_gemm_minmax_ukernel_8x32c8__neoni8mm, 32, false, 8, 32, 8, 1, 8, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +#endif // XNN_ENABLE_ARM_I8MM && (XNN_ARCH_ARM64) + diff --git a/src/qd8-f16-qc4w-gemm/qd8-f16-qc4w-gemm-minmax.h b/src/qd8-f16-qc4w-gemm/qd8-f16-qc4w-gemm-minmax.h index e69de29bb2d..b5b1ab9f659 100644 --- a/src/qd8-f16-qc4w-gemm/qd8-f16-qc4w-gemm-minmax.h +++ b/src/qd8-f16-qc4w-gemm/qd8-f16-qc4w-gemm-minmax.h @@ -0,0 +1,180 @@ +// Copyright 2023 Google LLC + // + // This source code is licensed under the BSD-style license found in the + // LICENSE file in the root directory of this source tree. + // Arguments are: + // XNN_GEMM(arch_flags, fn_name, k_block, is_pipelined, mr, nr, kr, sr, mr_packed, is_igemm, datatype, params_type, init_fn, pack_fn) + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_1x8c8__avx2_madd, 16, false, 1, 8, 8, 1, 1, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_2x8c8__avx2_madd, 16, false, 2, 8, 8, 1, 2, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_3x8c8__avx2_madd, 16, false, 3, 8, 8, 1, 3, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_4x8c8__avx2_madd, 16, false, 4, 8, 8, 1, 4, false, xnn_float16, 
struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_5x8c8__avx2_madd, 16, false, 5, 8, 8, 1, 5, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_6x8c8__avx2_madd, 16, false, 6, 8, 8, 1, 6, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_7x8c8__avx2_madd, 16, false, 7, 8, 8, 1, 7, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_8x8c8__avx2_madd, 16, false, 8, 8, 8, 1, 8, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_1x8c8__avx2_madd_prfm, 16, false, 1, 8, 8, 1, 1, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_2x8c8__avx2_madd_prfm, 16, false, 2, 8, 8, 1, 2, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_3x8c8__avx2_madd_prfm, 16, false, 3, 8, 8, 1, 3, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_4x8c8__avx2_madd_prfm, 16, false, 4, 8, 8, 1, 4, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) 
+XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_5x8c8__avx2_madd_prfm, 16, false, 5, 8, 8, 1, 5, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_6x8c8__avx2_madd_prfm, 16, false, 6, 8, 8, 1, 6, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_7x8c8__avx2_madd_prfm, 16, false, 7, 8, 8, 1, 7, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_8x8c8__avx2_madd_prfm, 16, false, 8, 8, 8, 1, 8, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX256SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_1x8c8__avx256skx_madd, 16, false, 1, 8, 8, 1, 1, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_5x8c8__avx256skx_madd, 16, false, 5, 8, 8, 1, 5, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_7x8c8__avx256skx_madd, 16, false, 7, 8, 8, 1, 7, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_8x8c8__avx256skx_madd, 16, false, 8, 8, 8, 1, 8, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, 
xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_9x8c8__avx256skx_madd, 16, false, 9, 8, 8, 1, 9, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_10x8c8__avx256skx_madd, 16, false, 10, 8, 8, 1, 10, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_12x8c8__avx256skx_madd, 16, false, 12, 8, 8, 1, 12, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_14x8c8__avx256skx_madd, 16, false, 14, 8, 8, 1, 14, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_1x8c8__avx256skx_madd_prfm, 16, false, 1, 8, 8, 1, 1, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_5x8c8__avx256skx_madd_prfm, 16, false, 5, 8, 8, 1, 5, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_7x8c8__avx256skx_madd_prfm, 16, false, 7, 8, 8, 1, 7, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_8x8c8__avx256skx_madd_prfm, 16, false, 8, 8, 8, 1, 8, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) 
+XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_9x8c8__avx256skx_madd_prfm, 16, false, 9, 8, 8, 1, 9, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_10x8c8__avx256skx_madd_prfm, 16, false, 10, 8, 8, 1, 10, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_12x8c8__avx256skx_madd_prfm, 16, false, 12, 8, 8, 1, 12, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_14x8c8__avx256skx_madd_prfm, 16, false, 14, 8, 8, 1, 14, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +#endif // XNN_ENABLE_AVX256SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + +#if XNN_ENABLE_ARM_I8MM && (XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_1x8c8__neoni8mm, 16, false, 1, 8, 8, 1, 1, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_1x16c8__neoni8mm, 16, false, 1, 16, 8, 1, 1, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_1x32c8__neoni8mm, 16, false, 1, 32, 8, 1, 1, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_2x8c8__neoni8mm, 16, false, 2, 8, 8, 1, 2, false, xnn_float16, struct xnn_f16_default_params, 
xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_2x16c8__neoni8mm, 16, false, 2, 16, 8, 1, 2, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_2x32c8__neoni8mm, 16, false, 2, 32, 8, 1, 2, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_3x8c8__neoni8mm, 16, false, 3, 8, 8, 1, 3, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_3x16c8__neoni8mm, 16, false, 3, 16, 8, 1, 3, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_3x32c8__neoni8mm, 16, false, 3, 32, 8, 1, 3, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_4x8c8__neoni8mm, 16, false, 4, 8, 8, 1, 4, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_4x16c8__neoni8mm, 16, false, 4, 16, 8, 1, 4, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_4x32c8__neoni8mm, 16, false, 4, 32, 8, 1, 4, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, 
xnn_qd8_f16_qc4w_gemm_minmax_ukernel_5x8c8__neoni8mm, 16, false, 5, 8, 8, 1, 5, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_5x16c8__neoni8mm, 16, false, 5, 16, 8, 1, 5, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_5x32c8__neoni8mm, 16, false, 5, 32, 8, 1, 5, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_6x8c8__neoni8mm, 16, false, 6, 8, 8, 1, 6, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_6x16c8__neoni8mm, 16, false, 6, 16, 8, 1, 6, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_6x32c8__neoni8mm, 16, false, 6, 32, 8, 1, 6, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_7x8c8__neoni8mm, 16, false, 7, 8, 8, 1, 7, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_7x16c8__neoni8mm, 16, false, 7, 16, 8, 1, 7, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_7x32c8__neoni8mm, 16, false, 7, 32, 8, 1, 7, false, xnn_float16, struct 
xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_8x8c8__neoni8mm, 16, false, 8, 8, 8, 1, 8, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_8x16c8__neoni8mm, 16, false, 8, 16, 8, 1, 8, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_8x32c8__neoni8mm, 16, false, 8, 32, 8, 1, 8, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +#endif // XNN_ENABLE_ARM_I8MM && (XNN_ARCH_ARM64) + +#if XNN_ENABLE_ARM_DOTPROD && XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_dot_fp16_arith, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_1x8c4__neondotfp16arith, 8, false, 1, 8, 4, 1, 1, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot_fp16_arith, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_1x16c4__neondotfp16arith, 8, false, 1, 16, 4, 1, 1, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot_fp16_arith, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_2x8c4__neondotfp16arith, 8, false, 2, 8, 4, 1, 2, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot_fp16_arith, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_2x16c4__neondotfp16arith, 8, false, 2, 16, 4, 1, 2, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) 
+XNN_GEMM(xnn_arch_arm_neon_dot_fp16_arith, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_3x8c4__neondotfp16arith, 8, false, 3, 8, 4, 1, 3, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot_fp16_arith, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_3x16c4__neondotfp16arith, 8, false, 3, 16, 4, 1, 3, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot_fp16_arith, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_4x8c4__neondotfp16arith, 8, false, 4, 8, 4, 1, 4, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot_fp16_arith, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_4x16c4__neondotfp16arith, 8, false, 4, 16, 4, 1, 4, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot_fp16_arith, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_5x8c4__neondotfp16arith, 8, false, 5, 8, 4, 1, 5, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot_fp16_arith, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_5x16c4__neondotfp16arith, 8, false, 5, 16, 4, 1, 5, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot_fp16_arith, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_6x8c4__neondotfp16arith, 8, false, 6, 8, 4, 1, 6, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot_fp16_arith, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_6x16c4__neondotfp16arith, 8, false, 6, 16, 4, 1, 6, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, 
xnn_pack_qs8_qc4w_gemm_goi_w) +#endif // XNN_ENABLE_ARM_DOTPROD && XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) + +#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_1x16__neonfp16arith_mlal_lane, 8, false, 1, 16, 1, 1, 1, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_1x16__neonfp16arith_mlal_lane_prfm, 8, false, 1, 16, 1, 1, 1, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_2x16__neonfp16arith_mlal_lane, 8, false, 2, 16, 1, 1, 2, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_2x16__neonfp16arith_mlal_lane_prfm, 8, false, 2, 16, 1, 1, 2, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_3x16__neonfp16arith_mlal_lane, 8, false, 3, 16, 1, 1, 3, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_3x16__neonfp16arith_mlal_lane_prfm, 8, false, 3, 16, 1, 1, 3, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_4x16__neonfp16arith_mlal_lane, 8, false, 4, 16, 1, 1, 4, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) 
+XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_4x16__neonfp16arith_mlal_lane_prfm, 8, false, 4, 16, 1, 1, 4, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_6x16__neonfp16arith_mlal_lane, 8, false, 6, 16, 1, 1, 6, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_6x16__neonfp16arith_mlal_lane_prfm, 8, false, 6, 16, 1, 1, 6, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) + +#if XNN_ENABLE_AVX256VNNI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_1x8c8__avx256vnni, 16, false, 1, 8, 8, 1, 1, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_5x8c8__avx256vnni, 16, false, 5, 8, 8, 1, 5, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_7x8c8__avx256vnni, 16, false, 7, 8, 8, 1, 7, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_8x8c8__avx256vnni, 16, false, 8, 8, 8, 1, 8, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_9x8c8__avx256vnni, 16, false, 9, 8, 8, 1, 9, false, 
xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_10x8c8__avx256vnni, 16, false, 10, 8, 8, 1, 10, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_12x8c8__avx256vnni, 16, false, 12, 8, 8, 1, 12, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_14x8c8__avx256vnni, 16, false, 14, 8, 8, 1, 14, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_1x8c8__avx256vnni_prfm, 16, false, 1, 8, 8, 1, 1, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_5x8c8__avx256vnni_prfm, 16, false, 5, 8, 8, 1, 5, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_7x8c8__avx256vnni_prfm, 16, false, 7, 8, 8, 1, 7, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_8x8c8__avx256vnni_prfm, 16, false, 8, 8, 8, 1, 8, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_9x8c8__avx256vnni_prfm, 16, false, 9, 8, 8, 1, 9, false, xnn_float16, struct xnn_f16_default_params, 
xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_10x8c8__avx256vnni_prfm, 16, false, 10, 8, 8, 1, 10, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_12x8c8__avx256vnni_prfm, 16, false, 12, 8, 8, 1, 12, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_14x8c8__avx256vnni_prfm, 16, false, 14, 8, 8, 1, 14, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +#endif // XNN_ENABLE_AVX256VNNI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + +#if XNN_ENABLE_AVX256VNNIGFNI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +XNN_GEMM(xnn_arch_x86_avx256vnnigfni, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_1x8c8__avx256vnnigfni, 16, false, 1, 8, 8, 1, 1, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnnigfni, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_5x8c8__avx256vnnigfni, 16, false, 5, 8, 8, 1, 5, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnnigfni, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_7x8c8__avx256vnnigfni, 16, false, 7, 8, 8, 1, 7, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnnigfni, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_8x8c8__avx256vnnigfni, 16, false, 8, 8, 8, 1, 8, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnnigfni, 
xnn_qd8_f16_qc4w_gemm_minmax_ukernel_9x8c8__avx256vnnigfni, 16, false, 9, 8, 8, 1, 9, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnnigfni, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_10x8c8__avx256vnnigfni, 16, false, 10, 8, 8, 1, 10, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnnigfni, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_12x8c8__avx256vnnigfni, 16, false, 12, 8, 8, 1, 12, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnnigfni, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_14x8c8__avx256vnnigfni, 16, false, 14, 8, 8, 1, 14, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnnigfni, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_1x8c8__avx256vnnigfni_prfm, 16, false, 1, 8, 8, 1, 1, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnnigfni, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_5x8c8__avx256vnnigfni_prfm, 16, false, 5, 8, 8, 1, 5, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnnigfni, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_7x8c8__avx256vnnigfni_prfm, 16, false, 7, 8, 8, 1, 7, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnnigfni, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_8x8c8__avx256vnnigfni_prfm, 16, false, 8, 8, 8, 1, 8, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnnigfni, 
xnn_qd8_f16_qc4w_gemm_minmax_ukernel_9x8c8__avx256vnnigfni_prfm, 16, false, 9, 8, 8, 1, 9, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnnigfni, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_10x8c8__avx256vnnigfni_prfm, 16, false, 10, 8, 8, 1, 10, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnnigfni, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_12x8c8__avx256vnnigfni_prfm, 16, false, 12, 8, 8, 1, 12, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnnigfni, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_14x8c8__avx256vnnigfni_prfm, 16, false, 14, 8, 8, 1, 14, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +#endif // XNN_ENABLE_AVX256VNNIGFNI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + +#if XNN_ENABLE_AVXVNNI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_1x8c8__avxvnni, 16, false, 1, 8, 8, 1, 1, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_2x8c8__avxvnni, 16, false, 2, 8, 8, 1, 2, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_3x8c8__avxvnni, 16, false, 3, 8, 8, 1, 3, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_4x8c8__avxvnni, 16, false, 4, 8, 8, 1, 4, false, xnn_float16, struct xnn_f16_default_params, 
xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_5x8c8__avxvnni, 16, false, 5, 8, 8, 1, 5, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_6x8c8__avxvnni, 16, false, 6, 8, 8, 1, 6, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_7x8c8__avxvnni, 16, false, 7, 8, 8, 1, 7, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_8x8c8__avxvnni, 16, false, 8, 8, 8, 1, 8, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_1x8c8__avxvnni_prfm, 16, false, 1, 8, 8, 1, 1, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_2x8c8__avxvnni_prfm, 16, false, 2, 8, 8, 1, 2, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_3x8c8__avxvnni_prfm, 16, false, 3, 8, 8, 1, 3, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_4x8c8__avxvnni_prfm, 16, false, 4, 8, 8, 1, 4, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, 
xnn_qd8_f16_qc4w_gemm_minmax_ukernel_5x8c8__avxvnni_prfm, 16, false, 5, 8, 8, 1, 5, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_6x8c8__avxvnni_prfm, 16, false, 6, 8, 8, 1, 6, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_7x8c8__avxvnni_prfm, 16, false, 7, 8, 8, 1, 7, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_8x8c8__avxvnni_prfm, 16, false, 8, 8, 8, 1, 8, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +#endif // XNN_ENABLE_AVXVNNI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_1x8c8__avx2, 8, false, 1, 8, 8, 1, 1, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_2x8c8__avx2, 8, false, 2, 8, 8, 1, 2, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_3x8c8__avx2, 8, false, 3, 8, 8, 1, 3, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_4x8c8__avx2, 8, false, 4, 8, 8, 1, 4, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, 
xnn_qd8_f16_qc4w_gemm_minmax_ukernel_5x8c8__avx2, 8, false, 5, 8, 8, 1, 5, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_6x8c8__avx2, 8, false, 6, 8, 8, 1, 6, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_7x8c8__avx2, 8, false, 7, 8, 8, 1, 7, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_8x8c8__avx2, 8, false, 8, 8, 8, 1, 8, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX256SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_1x8c8__avx256skx, 8, false, 1, 8, 8, 1, 1, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_2x8c8__avx256skx, 8, false, 2, 8, 8, 1, 2, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_3x8c8__avx256skx, 8, false, 3, 8, 8, 1, 3, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_4x8c8__avx256skx, 8, false, 4, 8, 8, 1, 4, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256skx, 
xnn_qd8_f16_qc4w_gemm_minmax_ukernel_5x8c8__avx256skx, 8, false, 5, 8, 8, 1, 5, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_6x8c8__avx256skx, 8, false, 6, 8, 8, 1, 6, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_7x8c8__avx256skx, 8, false, 7, 8, 8, 1, 7, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f16_qc4w_gemm_minmax_ukernel_8x8c8__avx256skx, 8, false, 8, 8, 8, 1, 8, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +#endif // XNN_ENABLE_AVX256SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + diff --git a/src/qd8-f16-qc8w-gemm/qd8-f16-qc8w-gemm-minmax.h b/src/qd8-f16-qc8w-gemm/qd8-f16-qc8w-gemm-minmax.h index e69de29bb2d..c0c38d1ccff 100644 --- a/src/qd8-f16-qc8w-gemm/qd8-f16-qc8w-gemm-minmax.h +++ b/src/qd8-f16-qc8w-gemm/qd8-f16-qc8w-gemm-minmax.h @@ -0,0 +1,126 @@ +// Copyright 2023 Google LLC + // + // This source code is licensed under the BSD-style license found in the + // LICENSE file in the root directory of this source tree. 
+ // Arguments are: + // XNN_GEMM(arch_flags, fn_name, k_block, is_pipelined, mr, nr, kr, sr, mr_packed, is_igemm, datatype, params_type, init_fn, pack_fn) + + +#if XNN_ENABLE_AVX512AMX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +XNN_GEMM(xnn_arch_x86_avx512amx, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x64c4__avx512amx, 64, false, 1, 64, 4, 1, 1, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512amx, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_7x64c4__avx512amx, 64, false, 7, 64, 4, 1, 7, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512amx, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_16x64c4__avx512amx, 64, false, 16, 64, 4, 1, 16, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512amx, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_16x64c4__avx512amx_prfm, 64, false, 16, 64, 4, 1, 16, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +#endif // XNN_ENABLE_AVX512AMX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + +#if XNN_ENABLE_ARM_I8MM && (XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x8c8__neoni8mm, 16, false, 1, 8, 8, 1, 1, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x16c8__neoni8mm, 16, false, 1, 16, 8, 1, 1, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x32c8__neoni8mm, 16, false, 1, 32, 8, 1, 1, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, 
xnn_qd8_f16_qc8w_gemm_minmax_ukernel_2x8c8__neoni8mm, 16, false, 2, 8, 8, 1, 2, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_2x16c8__neoni8mm, 16, false, 2, 16, 8, 1, 2, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_2x32c8__neoni8mm, 16, false, 2, 32, 8, 1, 2, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_3x8c8__neoni8mm, 16, false, 3, 8, 8, 1, 3, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_3x16c8__neoni8mm, 16, false, 3, 16, 8, 1, 3, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_3x32c8__neoni8mm, 16, false, 3, 32, 8, 1, 3, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_4x8c8__neoni8mm, 16, false, 4, 8, 8, 1, 4, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_4x16c8__neoni8mm, 16, false, 4, 16, 8, 1, 4, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_4x32c8__neoni8mm, 16, false, 4, 32, 8, 1, 4, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, 
xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_5x8c8__neoni8mm, 16, false, 5, 8, 8, 1, 5, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_5x16c8__neoni8mm, 16, false, 5, 16, 8, 1, 5, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_5x32c8__neoni8mm, 16, false, 5, 32, 8, 1, 5, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_6x8c8__neoni8mm, 16, false, 6, 8, 8, 1, 6, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_6x16c8__neoni8mm, 16, false, 6, 16, 8, 1, 6, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_6x32c8__neoni8mm, 16, false, 6, 32, 8, 1, 6, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_7x8c8__neoni8mm, 16, false, 7, 8, 8, 1, 7, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_7x16c8__neoni8mm, 16, false, 7, 16, 8, 1, 7, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_7x32c8__neoni8mm, 16, false, 7, 32, 8, 1, 7, false, xnn_float16, struct 
xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_8x8c8__neoni8mm, 16, false, 8, 8, 8, 1, 8, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_8x16c8__neoni8mm, 16, false, 8, 16, 8, 1, 8, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_8x32c8__neoni8mm, 16, false, 8, 32, 8, 1, 8, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +#endif // XNN_ENABLE_ARM_I8MM && (XNN_ARCH_ARM64) + +#if XNN_ENABLE_ARM_DOTPROD && XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_dot_fp16_arith, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x8c4__neondotfp16arith, 4, false, 1, 8, 4, 1, 1, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot_fp16_arith, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x16c4__neondotfp16arith, 4, false, 1, 16, 4, 1, 1, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot_fp16_arith, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_2x8c4__neondotfp16arith, 4, false, 2, 8, 4, 1, 2, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot_fp16_arith, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_2x16c4__neondotfp16arith, 4, false, 2, 16, 4, 1, 2, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot_fp16_arith, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_3x8c4__neondotfp16arith, 4, false, 
3, 8, 4, 1, 3, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot_fp16_arith, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_3x16c4__neondotfp16arith, 4, false, 3, 16, 4, 1, 3, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot_fp16_arith, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_4x8c4__neondotfp16arith, 4, false, 4, 8, 4, 1, 4, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +#endif // XNN_ENABLE_ARM_DOTPROD && XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) + +#if XNN_ENABLE_ARM_DOTPROD && XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_dot_fp16_arith, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_4x16c4__asm_aarch64_neondotfp16arith_cortex_a55, 16, false, 4, 16, 4, 1, 4, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot_fp16_arith, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_4x16c4__asm_aarch64_neondotfp16arith_ld128, 16, false, 4, 16, 4, 1, 4, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +#endif // XNN_ENABLE_ARM_DOTPROD && XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM64) + +#if XNN_ENABLE_ARM_DOTPROD && XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_dot_fp16_arith, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_4x16c4__neondotfp16arith, 4, false, 4, 16, 4, 1, 4, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot_fp16_arith, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_5x8c4__neondotfp16arith, 4, false, 5, 8, 4, 1, 5, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) 
+XNN_GEMM(xnn_arch_arm_neon_dot_fp16_arith, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_5x16c4__neondotfp16arith, 4, false, 5, 16, 4, 1, 5, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot_fp16_arith, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_6x8c4__neondotfp16arith, 4, false, 6, 8, 4, 1, 6, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot_fp16_arith, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_6x16c4__neondotfp16arith, 4, false, 6, 16, 4, 1, 6, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +#endif // XNN_ENABLE_ARM_DOTPROD && XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) + +#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x8c2s4__neonfp16arith, 16, false, 1, 8, 2, 4, 1, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_fp16_arith, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_2x8c2s4__neonfp16arith, 16, false, 2, 8, 2, 4, 2, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) + +#if XNN_ENABLE_ARM_DOTPROD && XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM) +XNN_GEMM(xnn_arch_arm_neon_dot_fp16_arith, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_4x8c4__asm_aarch32_neondotfp16arith_cortex_a55, 8, false, 4, 8, 4, 1, 4, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +#endif // XNN_ENABLE_ARM_DOTPROD && XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM) + +#if XNN_ENABLE_AVX256VNNI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +XNN_GEMM(xnn_arch_x86_avx256vnni, 
xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x8c8__avx256vnni, 16, false, 1, 8, 8, 1, 1, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_5x8c8__avx256vnni, 16, false, 5, 8, 8, 1, 5, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_7x8c8__avx256vnni, 16, false, 7, 8, 8, 1, 7, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_8x8c8__avx256vnni, 16, false, 8, 8, 8, 1, 8, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_9x8c8__avx256vnni, 16, false, 9, 8, 8, 1, 9, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_10x8c8__avx256vnni, 16, false, 10, 8, 8, 1, 10, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_12x8c8__avx256vnni, 16, false, 12, 8, 8, 1, 12, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_14x8c8__avx256vnni, 16, false, 14, 8, 8, 1, 14, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x8c8__avx256vnni_prfm, 16, false, 1, 8, 8, 1, 1, false, xnn_float16, struct xnn_f16_default_params, 
xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_5x8c8__avx256vnni_prfm, 16, false, 5, 8, 8, 1, 5, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_7x8c8__avx256vnni_prfm, 16, false, 7, 8, 8, 1, 7, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_8x8c8__avx256vnni_prfm, 16, false, 8, 8, 8, 1, 8, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_9x8c8__avx256vnni_prfm, 16, false, 9, 8, 8, 1, 9, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_10x8c8__avx256vnni_prfm, 16, false, 10, 8, 8, 1, 10, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_12x8c8__avx256vnni_prfm, 16, false, 12, 8, 8, 1, 12, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_14x8c8__avx256vnni_prfm, 16, false, 14, 8, 8, 1, 14, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +#endif // XNN_ENABLE_AVX256VNNI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + +#if XNN_ENABLE_AVXVNNI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x8c8__avxvnni, 16, false, 1, 8, 8, 1, 1, false, xnn_float16, struct xnn_f16_default_params, 
xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_2x8c8__avxvnni, 16, false, 2, 8, 8, 1, 2, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_3x8c8__avxvnni, 16, false, 3, 8, 8, 1, 3, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_4x8c8__avxvnni, 16, false, 4, 8, 8, 1, 4, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_5x8c8__avxvnni, 16, false, 5, 8, 8, 1, 5, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_6x8c8__avxvnni, 16, false, 6, 8, 8, 1, 6, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_7x8c8__avxvnni, 16, false, 7, 8, 8, 1, 7, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_8x8c8__avxvnni, 16, false, 8, 8, 8, 1, 8, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x8c8__avxvnni_prfm, 16, false, 1, 8, 8, 1, 1, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_2x8c8__avxvnni_prfm, 16, false, 2, 8, 8, 1, 2, false, xnn_float16, struct 
xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_3x8c8__avxvnni_prfm, 16, false, 3, 8, 8, 1, 3, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_4x8c8__avxvnni_prfm, 16, false, 4, 8, 8, 1, 4, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_5x8c8__avxvnni_prfm, 16, false, 5, 8, 8, 1, 5, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_6x8c8__avxvnni_prfm, 16, false, 6, 8, 8, 1, 6, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_7x8c8__avxvnni_prfm, 16, false, 7, 8, 8, 1, 7, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_8x8c8__avxvnni_prfm, 16, false, 8, 8, 8, 1, 8, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +#endif // XNN_ENABLE_AVXVNNI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x8c8__avx2, 8, false, 1, 8, 8, 1, 1, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_2x8c8__avx2, 8, false, 2, 8, 8, 1, 2, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) 
+XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_3x8c8__avx2, 8, false, 3, 8, 8, 1, 3, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_4x8c8__avx2, 8, false, 4, 8, 8, 1, 4, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX256SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x8c8__avx256skx, 8, false, 1, 8, 8, 1, 1, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_5x8c8__avx256skx, 8, false, 5, 8, 8, 1, 5, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_7x8c8__avx256skx, 8, false, 7, 8, 8, 1, 7, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f16_qc8w_gemm_minmax_ukernel_8x8c8__avx256skx, 8, false, 8, 8, 8, 1, 8, false, xnn_float16, struct xnn_f16_default_params, xnn_init_f16_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +#endif // XNN_ENABLE_AVX256SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + diff --git a/src/qd8-f32-qb4w-gemm/qd8-f32-qb4w-gemm-minmax.h b/src/qd8-f32-qb4w-gemm/qd8-f32-qb4w-gemm-minmax.h index e69de29bb2d..e229e937711 100644 --- a/src/qd8-f32-qb4w-gemm/qd8-f32-qb4w-gemm-minmax.h +++ b/src/qd8-f32-qb4w-gemm/qd8-f32-qb4w-gemm-minmax.h @@ -0,0 +1,142 @@ +// Copyright 2023 Google LLC + // + // This source code is licensed under the BSD-style license found in the + // LICENSE file in the root directory of this source tree. 
+ // Arguments are: + // XNN_GEMM(arch_flags, fn_name, k_block, is_pipelined, mr, nr, kr, sr, mr_packed, is_igemm, datatype, params_type, init_fn, pack_fn) + + XNN_GEMM(0, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x2__scalar, 32, false, 1, 2, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x4__scalar, 32, false, 1, 4, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x8__scalar, 32, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_2x2__scalar, 32, false, 2, 2, 1, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_2x4__scalar, 32, false, 2, 4, 1, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_2x8__scalar, 32, false, 2, 8, 1, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_4x4__scalar, 32, false, 4, 4, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_GEMM(xnn_arch_x86_avx, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x4c8__avx_ld128, 32, false, 1, 4, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_2x4c8__avx_ld128, 32, false, 2, 4, 8, 1, 2, false, float, struct 
xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_3x4c8__avx_ld128, 32, false, 3, 4, 8, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_4x4c8__avx_ld128, 32, false, 4, 4, 8, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x4c8__avx_ld64, 32, false, 1, 4, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_2x4c8__avx_ld64, 32, false, 2, 4, 8, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_3x4c8__avx_ld64, 32, false, 3, 4, 8, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_4x4c8__avx_ld64, 32, false, 4, 4, 8, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x4c8__sse2_ld128, 32, false, 1, 4, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_2x4c8__sse2_ld128, 32, false, 2, 4, 8, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_3x4c8__sse2_ld128, 32, false, 3, 4, 8, 1, 3, false, float, struct xnn_f32_default_params, 
xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_4x4c8__sse2_ld128, 32, false, 4, 4, 8, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x4c8__sse2_ld64, 32, false, 1, 4, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_2x4c8__sse2_ld64, 32, false, 2, 4, 8, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_3x4c8__sse2_ld64, 32, false, 3, 4, 8, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_4x4c8__sse2_ld64, 32, false, 4, 4, 8, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x4c8__sse41_ld128, 32, false, 1, 4, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_2x4c8__sse41_ld128, 32, false, 2, 4, 8, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_3x4c8__sse41_ld128, 32, false, 3, 4, 8, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_4x4c8__sse41_ld128, 32, false, 4, 4, 8, 1, 4, false, float, struct xnn_f32_default_params, 
xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x4c8__sse41_ld64, 32, false, 1, 4, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_2x4c8__sse41_ld64, 32, false, 2, 4, 8, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_3x4c8__sse41_ld64, 32, false, 3, 4, 8, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_4x4c8__sse41_ld64, 32, false, 4, 4, 8, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x8c4__neondot, 32, false, 1, 8, 4, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x16c4__neondot, 32, false, 1, 16, 4, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_2x8c4__neondot, 32, false, 2, 8, 4, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_2x16c4__neondot, 32, false, 2, 16, 4, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, 
xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_3x8c4__neondot, 32, false, 3, 8, 4, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_3x16c4__neondot, 32, false, 3, 16, 4, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_4x8c4__neondot, 32, false, 4, 8, 4, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_4x16c4__neondot, 32, false, 4, 16, 4, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_5x8c4__neondot, 32, false, 5, 8, 4, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_5x16c4__neondot, 32, false, 5, 16, 4, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_6x8c4__neondot, 32, false, 6, 8, 4, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_6x16c4__neondot, 32, false, 6, 16, 4, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +#endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_GEMM(xnn_arch_x86_avx2, 
xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x8c8__avx2, 32, false, 1, 8, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_2x8c8__avx2, 32, false, 2, 8, 8, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_3x8c8__avx2, 32, false, 3, 8, 8, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_4x8c8__avx2, 32, false, 4, 8, 8, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x16__neon_mlal_lane, 32, false, 1, 16, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x16__neon_mlal_lane_prfm, 32, false, 1, 16, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_2x16__neon_mlal_lane, 32, false, 2, 16, 1, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_2x16__neon_mlal_lane_prfm, 32, false, 2, 16, 1, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_3x16__neon_mlal_lane, 32, false, 3, 16, 1, 1, 3, false, float, struct 
xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_3x16__neon_mlal_lane_prfm, 32, false, 3, 16, 1, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_4x16__neon_mlal_lane, 32, false, 4, 16, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_4x16__neon_mlal_lane_prfm, 32, false, 4, 16, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_6x16__neon_mlal_lane, 32, false, 6, 16, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_6x16__neon_mlal_lane_prfm, 32, false, 6, 16, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ENABLE_ARM_I8MM && (XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x8c8__neoni8mm, 32, false, 1, 8, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x16c8__neoni8mm, 32, false, 1, 16, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x32c8__neoni8mm, 32, false, 1, 32, 8, 1, 1, false, float, struct xnn_f32_default_params, 
xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_2x8c8__neoni8mm, 32, false, 2, 8, 8, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_2x16c8__neoni8mm, 32, false, 2, 16, 8, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_2x32c8__neoni8mm, 32, false, 2, 32, 8, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_3x8c8__neoni8mm, 32, false, 3, 8, 8, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_3x16c8__neoni8mm, 32, false, 3, 16, 8, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_3x32c8__neoni8mm, 32, false, 3, 32, 8, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_4x8c8__neoni8mm, 32, false, 4, 8, 8, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_4x16c8__neoni8mm, 32, false, 4, 16, 8, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, 
xnn_qd8_f32_qb4w_gemm_minmax_ukernel_4x32c8__neoni8mm, 32, false, 4, 32, 8, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_5x8c8__neoni8mm, 32, false, 5, 8, 8, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_5x16c8__neoni8mm, 32, false, 5, 16, 8, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_5x32c8__neoni8mm, 32, false, 5, 32, 8, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_6x8c8__neoni8mm, 32, false, 6, 8, 8, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_6x16c8__neoni8mm, 32, false, 6, 16, 8, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_6x32c8__neoni8mm, 32, false, 6, 32, 8, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_7x8c8__neoni8mm, 32, false, 7, 8, 8, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_7x16c8__neoni8mm, 32, false, 7, 16, 8, 1, 7, false, float, struct xnn_f32_default_params, 
xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_7x32c8__neoni8mm, 32, false, 7, 32, 8, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_8x8c8__neoni8mm, 32, false, 8, 8, 8, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_8x16c8__neoni8mm, 32, false, 8, 16, 8, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_8x32c8__neoni8mm, 32, false, 8, 32, 8, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +#endif // XNN_ENABLE_ARM_I8MM && (XNN_ARCH_ARM64) + +#if XNN_ENABLE_AVX512VNNI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x16c8__avx512vnni, 32, false, 1, 16, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_5x16c8__avx512vnni, 32, false, 5, 16, 8, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_7x16c8__avx512vnni, 32, false, 7, 16, 8, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_8x16c8__avx512vnni, 32, false, 8, 16, 8, 1, 8, false, float, struct xnn_f32_default_params, 
xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_9x16c8__avx512vnni, 32, false, 9, 16, 8, 1, 9, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_10x16c8__avx512vnni, 32, false, 10, 16, 8, 1, 10, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_12x16c8__avx512vnni, 32, false, 12, 16, 8, 1, 12, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_14x16c8__avx512vnni, 32, false, 14, 16, 8, 1, 14, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x16c8__avx512vnni_prfm, 32, false, 1, 16, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_5x16c8__avx512vnni_prfm, 32, false, 5, 16, 8, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_7x16c8__avx512vnni_prfm, 32, false, 7, 16, 8, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_8x16c8__avx512vnni_prfm, 32, false, 8, 16, 8, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) 
+XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_9x16c8__avx512vnni_prfm, 32, false, 9, 16, 8, 1, 9, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_10x16c8__avx512vnni_prfm, 32, false, 10, 16, 8, 1, 10, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_12x16c8__avx512vnni_prfm, 32, false, 12, 16, 8, 1, 12, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_14x16c8__avx512vnni_prfm, 32, false, 14, 16, 8, 1, 14, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +#endif // XNN_ENABLE_AVX512VNNI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + +#if XNN_ENABLE_AVX512VNNIGFNI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +XNN_GEMM(xnn_arch_x86_avx512vnnigfni, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x16c8__avx512vnnigfni, 32, false, 1, 16, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnnigfni, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_5x16c8__avx512vnnigfni, 32, false, 5, 16, 8, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnnigfni, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_7x16c8__avx512vnnigfni, 32, false, 7, 16, 8, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnnigfni, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_8x16c8__avx512vnnigfni, 32, false, 8, 16, 8, 1, 8, false, float, struct 
xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnnigfni, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_9x16c8__avx512vnnigfni, 32, false, 9, 16, 8, 1, 9, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnnigfni, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_10x16c8__avx512vnnigfni, 32, false, 10, 16, 8, 1, 10, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnnigfni, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_12x16c8__avx512vnnigfni, 32, false, 12, 16, 8, 1, 12, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnnigfni, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_14x16c8__avx512vnnigfni, 32, false, 14, 16, 8, 1, 14, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnnigfni, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x16c8__avx512vnnigfni_prfm, 32, false, 1, 16, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnnigfni, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_5x16c8__avx512vnnigfni_prfm, 32, false, 5, 16, 8, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnnigfni, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_7x16c8__avx512vnnigfni_prfm, 32, false, 7, 16, 8, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnnigfni, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_8x16c8__avx512vnnigfni_prfm, 32, false, 8, 16, 8, 1, 8, false, float, struct xnn_f32_default_params, 
xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnnigfni, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_9x16c8__avx512vnnigfni_prfm, 32, false, 9, 16, 8, 1, 9, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnnigfni, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_10x16c8__avx512vnnigfni_prfm, 32, false, 10, 16, 8, 1, 10, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnnigfni, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_12x16c8__avx512vnnigfni_prfm, 32, false, 12, 16, 8, 1, 12, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnnigfni, xnn_qd8_f32_qb4w_gemm_minmax_ukernel_14x16c8__avx512vnnigfni_prfm, 32, false, 14, 16, 8, 1, 14, false, float, struct xnn_f32_default_params, xnn_init_f32_qb4w_minmax_scalar_params, xnn_pack_qs8_qb4w_gemm_goi_w) +#endif // XNN_ENABLE_AVX512VNNIGFNI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + diff --git a/src/qd8-f32-qc4w-gemm/qd8-f32-qc4w-gemm-minmax.h b/src/qd8-f32-qc4w-gemm/qd8-f32-qc4w-gemm-minmax.h index e69de29bb2d..dad78361dd0 100644 --- a/src/qd8-f32-qc4w-gemm/qd8-f32-qc4w-gemm-minmax.h +++ b/src/qd8-f32-qc4w-gemm/qd8-f32-qc4w-gemm-minmax.h @@ -0,0 +1,397 @@ +// Copyright 2023 Google LLC + // + // This source code is licensed under the BSD-style license found in the + // LICENSE file in the root directory of this source tree. 
+ // Arguments are: + // XNN_GEMM(arch_flags, fn_name, k_block, is_pipelined, mr, nr, kr, sr, mr_packed, is_igemm, datatype, params_type, init_fn, pack_fn) + + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_GEMM(xnn_arch_x86_ssse3, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4c8__ssse3_madd, 16, false, 1, 4, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_ssse3, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x4c8__ssse3_madd, 16, false, 2, 4, 8, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_ssse3, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x4c8__ssse3_madd, 16, false, 3, 4, 8, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_ssse3, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4c8__ssse3_madd, 16, false, 4, 4, 8, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4c8__sse41_madd, 16, false, 1, 4, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x4c8__sse41_madd, 16, false, 2, 4, 8, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x4c8__sse41_madd, 16, false, 3, 4, 8, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4c8__sse41_madd, 16, false, 4, 4, 8, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, 
xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_ssse3, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4c8__ssse3_madd_prfm, 16, false, 1, 4, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_ssse3, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x4c8__ssse3_madd_prfm, 16, false, 2, 4, 8, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_ssse3, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x4c8__ssse3_madd_prfm, 16, false, 3, 4, 8, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_ssse3, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4c8__ssse3_madd_prfm, 16, false, 4, 4, 8, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4c8__sse41_madd_prfm, 16, false, 1, 4, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x4c8__sse41_madd_prfm, 16, false, 2, 4, 8, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x4c8__sse41_madd_prfm, 16, false, 3, 4, 8, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4c8__sse41_madd_prfm, 16, false, 4, 4, 8, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX256SKX && (XNN_ARCH_X86 || 
XNN_ARCH_X86_64) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x8c8__avx256skx_madd, 16, false, 1, 8, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x8c8__avx256skx_madd, 16, false, 5, 8, 8, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x8c8__avx256skx_madd, 16, false, 7, 8, 8, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_8x8c8__avx256skx_madd, 16, false, 8, 8, 8, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_9x8c8__avx256skx_madd, 16, false, 9, 8, 8, 1, 9, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_10x8c8__avx256skx_madd, 16, false, 10, 8, 8, 1, 10, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_12x8c8__avx256skx_madd, 16, false, 12, 8, 8, 1, 12, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_14x8c8__avx256skx_madd, 16, false, 14, 8, 8, 1, 14, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256skx, 
xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x8c8__avx256skx_madd_prfm, 16, false, 1, 8, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x8c8__avx256skx_madd_prfm, 16, false, 5, 8, 8, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x8c8__avx256skx_madd_prfm, 16, false, 7, 8, 8, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_8x8c8__avx256skx_madd_prfm, 16, false, 8, 8, 8, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_9x8c8__avx256skx_madd_prfm, 16, false, 9, 8, 8, 1, 9, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_10x8c8__avx256skx_madd_prfm, 16, false, 10, 8, 8, 1, 10, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_12x8c8__avx256skx_madd_prfm, 16, false, 12, 8, 8, 1, 12, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_14x8c8__avx256skx_madd_prfm, 16, false, 14, 8, 8, 1, 14, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +#endif // XNN_ENABLE_AVX256SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + +#if XNN_ARCH_X86 || 
XNN_ARCH_X86_64 +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x8c8__avx2_madd, 16, false, 1, 8, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x8c8__avx2_madd, 16, false, 2, 8, 8, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x8c8__avx2_madd, 16, false, 3, 8, 8, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x8c8__avx2_madd, 16, false, 4, 8, 8, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x8c8__avx2_madd, 16, false, 5, 8, 8, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_6x8c8__avx2_madd, 16, false, 6, 8, 8, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x8c8__avx2_madd, 16, false, 7, 8, 8, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_8x8c8__avx2_madd, 16, false, 8, 8, 8, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x8c8__avx2_madd_prfm, 16, false, 1, 8, 8, 1, 1, false, float, struct xnn_f32_default_params, 
xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x8c8__avx2_madd_prfm, 16, false, 2, 8, 8, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x8c8__avx2_madd_prfm, 16, false, 3, 8, 8, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x8c8__avx2_madd_prfm, 16, false, 4, 8, 8, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x8c8__avx2_madd_prfm, 16, false, 5, 8, 8, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_6x8c8__avx2_madd_prfm, 16, false, 6, 8, 8, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x8c8__avx2_madd_prfm, 16, false, 7, 8, 8, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_8x8c8__avx2_madd_prfm, 16, false, 8, 8, 8, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x16c4__avx512skx_madd, 8, false, 1, 16, 4, 1, 1, false, float, struct xnn_f32_default_params, 
xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x16c4__avx512skx_madd, 8, false, 4, 16, 4, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x16c4__avx512skx_madd, 8, false, 5, 16, 4, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x16c4__avx512skx_madd, 8, false, 7, 16, 4, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_8x16c4__avx512skx_madd, 8, false, 8, 16, 4, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_9x16c4__avx512skx_madd, 8, false, 9, 16, 4, 1, 9, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_10x16c4__avx512skx_madd, 8, false, 10, 16, 4, 1, 10, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_12x16c4__avx512skx_madd, 8, false, 12, 16, 4, 1, 12, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_14x16c4__avx512skx_madd, 8, false, 14, 16, 4, 1, 14, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) 
+XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x16c4__avx512skx_madd_prfm, 8, false, 1, 16, 4, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x16c4__avx512skx_madd_prfm, 8, false, 4, 16, 4, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x16c4__avx512skx_madd_prfm, 8, false, 5, 16, 4, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x16c4__avx512skx_madd_prfm, 8, false, 7, 16, 4, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_8x16c4__avx512skx_madd_prfm, 8, false, 8, 16, 4, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_9x16c4__avx512skx_madd_prfm, 8, false, 9, 16, 4, 1, 9, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_10x16c4__avx512skx_madd_prfm, 8, false, 10, 16, 4, 1, 10, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_12x16c4__avx512skx_madd_prfm, 8, false, 12, 16, 4, 1, 12, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, 
xnn_qd8_f32_qc4w_gemm_minmax_ukernel_14x16c4__avx512skx_madd_prfm, 8, false, 14, 16, 4, 1, 14, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x16c8__avx512skx_madd, 16, false, 1, 16, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x16c8__avx512skx_madd, 16, false, 5, 16, 8, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x16c8__avx512skx_madd, 16, false, 7, 16, 8, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_8x16c8__avx512skx_madd, 16, false, 8, 16, 8, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_9x16c8__avx512skx_madd, 16, false, 9, 16, 8, 1, 9, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_10x16c8__avx512skx_madd, 16, false, 10, 16, 8, 1, 10, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_12x16c8__avx512skx_madd, 16, false, 12, 16, 8, 1, 12, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_14x16c8__avx512skx_madd, 16, false, 14, 16, 
8, 1, 14, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x16c8__avx512skx_madd_prfm, 16, false, 1, 16, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x16c8__avx512skx_madd_prfm, 16, false, 5, 16, 8, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x16c8__avx512skx_madd_prfm, 16, false, 7, 16, 8, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_8x16c8__avx512skx_madd_prfm, 16, false, 8, 16, 8, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_9x16c8__avx512skx_madd_prfm, 16, false, 9, 16, 8, 1, 9, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_10x16c8__avx512skx_madd_prfm, 16, false, 10, 16, 8, 1, 10, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_12x16c8__avx512skx_madd_prfm, 16, false, 12, 16, 8, 1, 12, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_14x16c8__avx512skx_madd_prfm, 16, false, 14, 16, 8, 1, 14, false, float, struct 
xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4uw_gemm_goi_w) +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + +#if XNN_ENABLE_AVX512AMX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +XNN_GEMM(xnn_arch_x86_avx512amx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x64c4__avx512amx, 64, false, 1, 64, 4, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512amx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x64c4__avx512amx, 64, false, 7, 64, 4, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512amx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_16x64c4__avx512amx, 64, false, 16, 64, 4, 1, 16, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512amx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_16x64c4__avx512amx_prfm, 64, false, 16, 64, 4, 1, 16, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512amx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x32c4__avx512amx, 64, false, 1, 32, 4, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512amx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x32c4__avx512amx, 64, false, 7, 32, 4, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512amx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_16x32c4__avx512amx, 64, false, 16, 32, 4, 1, 16, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512amx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_16x32c4__avx512amx_prfm, 64, false, 16, 32, 4, 1, 16, false, 
float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512amx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x16c4__avx512amx, 64, false, 1, 16, 4, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512amx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x16c4__avx512amx, 64, false, 7, 16, 4, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512amx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_16x16c4__avx512amx, 64, false, 16, 16, 4, 1, 16, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512amx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_16x16c4__avx512amx_prfm, 64, false, 16, 16, 4, 1, 16, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +#endif // XNN_ENABLE_AVX512AMX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + +#if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x8c4__neondot, 8, false, 1, 8, 4, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x16c4__neondot, 8, false, 1, 16, 4, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x8c4__neondot, 8, false, 2, 8, 4, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x16c4__neondot, 8, false, 2, 16, 4, 1, 2, false, float, struct 
xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x16c4__neondot, 8, false, 3, 16, 4, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x8c4__neondot, 8, false, 4, 8, 4, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x16c4__neondot, 8, false, 4, 16, 4, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_6x8c4__neondot, 8, false, 6, 8, 4, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_6x16c4__neondot, 8, false, 6, 16, 4, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +#endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) + +#if XNN_ENABLE_ARM_I8MM && (XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x8c8__neoni8mm, 16, false, 1, 8, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x16c8__neoni8mm, 16, false, 1, 16, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x32c8__neoni8mm, 16, false, 1, 32, 8, 1, 1, false, float, struct xnn_f32_default_params, 
xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x8c8__neoni8mm, 16, false, 2, 8, 8, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x16c8__neoni8mm, 16, false, 2, 16, 8, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x32c8__neoni8mm, 16, false, 2, 32, 8, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x8c8__neoni8mm, 16, false, 3, 8, 8, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x16c8__neoni8mm, 16, false, 3, 16, 8, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x32c8__neoni8mm, 16, false, 3, 32, 8, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x8c8__neoni8mm, 16, false, 4, 8, 8, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x16c8__neoni8mm, 16, false, 4, 16, 8, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, 
xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x32c8__neoni8mm, 16, false, 4, 32, 8, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x8c8__neoni8mm, 16, false, 5, 8, 8, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x16c8__neoni8mm, 16, false, 5, 16, 8, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x32c8__neoni8mm, 16, false, 5, 32, 8, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_6x8c8__neoni8mm, 16, false, 6, 8, 8, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_6x16c8__neoni8mm, 16, false, 6, 16, 8, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_6x32c8__neoni8mm, 16, false, 6, 32, 8, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x8c8__neoni8mm, 16, false, 7, 8, 8, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x16c8__neoni8mm, 16, false, 7, 16, 8, 1, 7, false, float, struct xnn_f32_default_params, 
xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x32c8__neoni8mm, 16, false, 7, 32, 8, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_8x8c8__neoni8mm, 16, false, 8, 8, 8, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_8x16c8__neoni8mm, 16, false, 8, 16, 8, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_8x32c8__neoni8mm, 16, false, 8, 32, 8, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +#endif // XNN_ENABLE_ARM_I8MM && (XNN_ARCH_ARM64) + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x16__neon_mlal_lane, 8, false, 1, 16, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x16__neon_mlal_lane_prfm, 8, false, 1, 16, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x16__neon_mlal_lane, 8, false, 2, 16, 1, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x16__neon_mlal_lane_prfm, 8, false, 2, 16, 1, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, 
xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x16__neon_mlal_lane, 8, false, 3, 16, 1, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x16__neon_mlal_lane_prfm, 8, false, 3, 16, 1, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x16__neon_mlal_lane, 8, false, 4, 16, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x16__neon_mlal_lane_prfm, 8, false, 4, 16, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_6x16__neon_mlal_lane, 8, false, 6, 16, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_6x16__neon_mlal_lane_prfm, 8, false, 6, 16, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x16c8__avx512skx, 16, false, 1, 16, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x16c8__avx512skx, 16, false, 5, 16, 8, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) 
+XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x16c8__avx512skx, 16, false, 7, 16, 8, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_8x16c8__avx512skx, 16, false, 8, 16, 8, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x16c8__avx512skx_prfm, 16, false, 1, 16, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x16c8__avx512skx_prfm, 16, false, 5, 16, 8, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x16c8__avx512skx_prfm, 16, false, 7, 16, 8, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_8x16c8__avx512skx_prfm, 16, false, 8, 16, 8, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + +#if XNN_ENABLE_AVX512VNNI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x16c4__avx512vnni, 8, false, 1, 16, 4, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x16c4__avx512vnni, 8, false, 4, 16, 4, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, 
xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x16c4__avx512vnni, 8, false, 5, 16, 4, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x16c4__avx512vnni, 8, false, 7, 16, 4, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_8x16c4__avx512vnni, 8, false, 8, 16, 4, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_9x16c4__avx512vnni, 8, false, 9, 16, 4, 1, 9, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_10x16c4__avx512vnni, 8, false, 10, 16, 4, 1, 10, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_12x16c4__avx512vnni, 8, false, 12, 16, 4, 1, 12, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_14x16c4__avx512vnni, 8, false, 14, 16, 4, 1, 14, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x16c4__avx512vnni_prfm, 8, false, 1, 16, 4, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, 
xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x16c4__avx512vnni_prfm, 8, false, 4, 16, 4, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x16c4__avx512vnni_prfm, 8, false, 5, 16, 4, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x16c4__avx512vnni_prfm, 8, false, 7, 16, 4, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_8x16c4__avx512vnni_prfm, 8, false, 8, 16, 4, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_9x16c4__avx512vnni_prfm, 8, false, 9, 16, 4, 1, 9, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_10x16c4__avx512vnni_prfm, 8, false, 10, 16, 4, 1, 10, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_12x16c4__avx512vnni_prfm, 8, false, 12, 16, 4, 1, 12, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_14x16c4__avx512vnni_prfm, 8, false, 14, 16, 4, 1, 14, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x16c8__avx512vnni, 16, false, 1, 16, 8, 1, 1, 
false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x16c8__avx512vnni, 16, false, 5, 16, 8, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x16c8__avx512vnni, 16, false, 7, 16, 8, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_8x16c8__avx512vnni, 16, false, 8, 16, 8, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_9x16c8__avx512vnni, 16, false, 9, 16, 8, 1, 9, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_10x16c8__avx512vnni, 16, false, 10, 16, 8, 1, 10, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_12x16c8__avx512vnni, 16, false, 12, 16, 8, 1, 12, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_14x16c8__avx512vnni, 16, false, 14, 16, 8, 1, 14, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x16c8__avx512vnni_prfm, 16, false, 1, 16, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, 
xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x16c8__avx512vnni_prfm, 16, false, 5, 16, 8, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x16c8__avx512vnni_prfm, 16, false, 7, 16, 8, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_8x16c8__avx512vnni_prfm, 16, false, 8, 16, 8, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_9x16c8__avx512vnni_prfm, 16, false, 9, 16, 8, 1, 9, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_10x16c8__avx512vnni_prfm, 16, false, 10, 16, 8, 1, 10, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_12x16c8__avx512vnni_prfm, 16, false, 12, 16, 8, 1, 12, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_14x16c8__avx512vnni_prfm, 16, false, 14, 16, 8, 1, 14, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +#endif // XNN_ENABLE_AVX512VNNI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + +#if XNN_ENABLE_AVX512VNNIGFNI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +XNN_GEMM(xnn_arch_x86_avx512vnnigfni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x16c4__avx512vnnigfni, 8, false, 1, 16, 4, 1, 1, false, float, struct 
xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnnigfni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x16c4__avx512vnnigfni, 8, false, 4, 16, 4, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnnigfni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x16c4__avx512vnnigfni, 8, false, 5, 16, 4, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnnigfni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x16c4__avx512vnnigfni, 8, false, 7, 16, 4, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnnigfni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_8x16c4__avx512vnnigfni, 8, false, 8, 16, 4, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnnigfni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_9x16c4__avx512vnnigfni, 8, false, 9, 16, 4, 1, 9, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnnigfni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_10x16c4__avx512vnnigfni, 8, false, 10, 16, 4, 1, 10, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnnigfni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_12x16c4__avx512vnnigfni, 8, false, 12, 16, 4, 1, 12, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnnigfni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_14x16c4__avx512vnnigfni, 8, false, 14, 16, 4, 1, 14, false, float, struct xnn_f32_default_params, 
xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnnigfni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x16c4__avx512vnnigfni_prfm, 8, false, 1, 16, 4, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnnigfni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x16c4__avx512vnnigfni_prfm, 8, false, 4, 16, 4, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnnigfni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x16c4__avx512vnnigfni_prfm, 8, false, 5, 16, 4, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnnigfni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x16c4__avx512vnnigfni_prfm, 8, false, 7, 16, 4, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnnigfni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_8x16c4__avx512vnnigfni_prfm, 8, false, 8, 16, 4, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnnigfni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_9x16c4__avx512vnnigfni_prfm, 8, false, 9, 16, 4, 1, 9, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnnigfni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_10x16c4__avx512vnnigfni_prfm, 8, false, 10, 16, 4, 1, 10, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnnigfni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_12x16c4__avx512vnnigfni_prfm, 8, false, 12, 16, 4, 1, 12, false, float, struct xnn_f32_default_params, 
xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnnigfni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_14x16c4__avx512vnnigfni_prfm, 8, false, 14, 16, 4, 1, 14, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnnigfni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x16c8__avx512vnnigfni, 16, false, 1, 16, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnnigfni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x16c8__avx512vnnigfni, 16, false, 5, 16, 8, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnnigfni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x16c8__avx512vnnigfni, 16, false, 7, 16, 8, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnnigfni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_8x16c8__avx512vnnigfni, 16, false, 8, 16, 8, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnnigfni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_9x16c8__avx512vnnigfni, 16, false, 9, 16, 8, 1, 9, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnnigfni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_10x16c8__avx512vnnigfni, 16, false, 10, 16, 8, 1, 10, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnnigfni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_12x16c8__avx512vnnigfni, 16, false, 12, 16, 8, 1, 12, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, 
xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnnigfni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_14x16c8__avx512vnnigfni, 16, false, 14, 16, 8, 1, 14, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnnigfni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x16c8__avx512vnnigfni_prfm, 16, false, 1, 16, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnnigfni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x16c8__avx512vnnigfni_prfm, 16, false, 5, 16, 8, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnnigfni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x16c8__avx512vnnigfni_prfm, 16, false, 7, 16, 8, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnnigfni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_8x16c8__avx512vnnigfni_prfm, 16, false, 8, 16, 8, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnnigfni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_9x16c8__avx512vnnigfni_prfm, 16, false, 9, 16, 8, 1, 9, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnnigfni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_10x16c8__avx512vnnigfni_prfm, 16, false, 10, 16, 8, 1, 10, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnnigfni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_12x16c8__avx512vnnigfni_prfm, 16, false, 12, 16, 8, 1, 12, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, 
xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnnigfni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_14x16c8__avx512vnnigfni_prfm, 16, false, 14, 16, 8, 1, 14, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +#endif // XNN_ENABLE_AVX512VNNIGFNI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + +#if XNN_ENABLE_AVX256VNNI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x8c8__avx256vnni, 16, false, 1, 8, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x8c8__avx256vnni, 16, false, 5, 8, 8, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x8c8__avx256vnni, 16, false, 7, 8, 8, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_8x8c8__avx256vnni, 16, false, 8, 8, 8, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_9x8c8__avx256vnni, 16, false, 9, 8, 8, 1, 9, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_10x8c8__avx256vnni, 16, false, 10, 8, 8, 1, 10, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_12x8c8__avx256vnni, 16, false, 12, 8, 8, 1, 12, false, float, struct xnn_f32_default_params, 
xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_14x8c8__avx256vnni, 16, false, 14, 8, 8, 1, 14, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x8c8__avx256vnni_prfm, 16, false, 1, 8, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x8c8__avx256vnni_prfm, 16, false, 5, 8, 8, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x8c8__avx256vnni_prfm, 16, false, 7, 8, 8, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_8x8c8__avx256vnni_prfm, 16, false, 8, 8, 8, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_9x8c8__avx256vnni_prfm, 16, false, 9, 8, 8, 1, 9, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_10x8c8__avx256vnni_prfm, 16, false, 10, 8, 8, 1, 10, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_12x8c8__avx256vnni_prfm, 16, false, 12, 8, 8, 1, 12, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) 
+XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_14x8c8__avx256vnni_prfm, 16, false, 14, 8, 8, 1, 14, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +#endif // XNN_ENABLE_AVX256VNNI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + +#if XNN_ENABLE_AVX256VNNIGFNI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +XNN_GEMM(xnn_arch_x86_avx256vnnigfni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x8c8__avx256vnnigfni, 16, false, 1, 8, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnnigfni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x8c8__avx256vnnigfni, 16, false, 5, 8, 8, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnnigfni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x8c8__avx256vnnigfni, 16, false, 7, 8, 8, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnnigfni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_8x8c8__avx256vnnigfni, 16, false, 8, 8, 8, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnnigfni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_9x8c8__avx256vnnigfni, 16, false, 9, 8, 8, 1, 9, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnnigfni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_10x8c8__avx256vnnigfni, 16, false, 10, 8, 8, 1, 10, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnnigfni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_12x8c8__avx256vnnigfni, 16, false, 12, 8, 8, 1, 12, false, float, struct xnn_f32_default_params, 
xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnnigfni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_14x8c8__avx256vnnigfni, 16, false, 14, 8, 8, 1, 14, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnnigfni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x8c8__avx256vnnigfni_prfm, 16, false, 1, 8, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnnigfni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x8c8__avx256vnnigfni_prfm, 16, false, 5, 8, 8, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnnigfni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x8c8__avx256vnnigfni_prfm, 16, false, 7, 8, 8, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnnigfni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_8x8c8__avx256vnnigfni_prfm, 16, false, 8, 8, 8, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnnigfni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_9x8c8__avx256vnnigfni_prfm, 16, false, 9, 8, 8, 1, 9, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnnigfni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_10x8c8__avx256vnnigfni_prfm, 16, false, 10, 8, 8, 1, 10, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnnigfni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_12x8c8__avx256vnnigfni_prfm, 16, false, 12, 8, 8, 1, 12, false, float, struct xnn_f32_default_params, 
xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnnigfni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_14x8c8__avx256vnnigfni_prfm, 16, false, 14, 8, 8, 1, 14, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +#endif // XNN_ENABLE_AVX256VNNIGFNI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + +#if XNN_ENABLE_AVXVNNI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x8c8__avxvnni, 16, false, 1, 8, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x8c8__avxvnni, 16, false, 2, 8, 8, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x8c8__avxvnni, 16, false, 3, 8, 8, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x8c8__avxvnni, 16, false, 4, 8, 8, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x8c8__avxvnni, 16, false, 5, 8, 8, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_6x8c8__avxvnni, 16, false, 6, 8, 8, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x8c8__avxvnni, 16, false, 7, 8, 8, 1, 7, false, float, struct xnn_f32_default_params, 
xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_8x8c8__avxvnni, 16, false, 8, 8, 8, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x8c8__avxvnni_prfm, 16, false, 1, 8, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x8c8__avxvnni_prfm, 16, false, 2, 8, 8, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x8c8__avxvnni_prfm, 16, false, 3, 8, 8, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x8c8__avxvnni_prfm, 16, false, 4, 8, 8, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x8c8__avxvnni_prfm, 16, false, 5, 8, 8, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_6x8c8__avxvnni_prfm, 16, false, 6, 8, 8, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x8c8__avxvnni_prfm, 16, false, 7, 8, 8, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, 
xnn_qd8_f32_qc4w_gemm_minmax_ukernel_8x8c8__avxvnni_prfm, 16, false, 8, 8, 8, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +#endif // XNN_ENABLE_AVXVNNI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x8c8__avx2, 16, false, 1, 8, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x8c8__avx2, 16, false, 2, 8, 8, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x8c8__avx2, 16, false, 3, 8, 8, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x8c8__avx2, 16, false, 4, 8, 8, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x8c8__avx2, 16, false, 5, 8, 8, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_6x8c8__avx2, 16, false, 6, 8, 8, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x8c8__avx2, 16, false, 7, 8, 8, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_8x8c8__avx2, 16, false, 8, 8, 8, 1, 8, false, float, struct xnn_f32_default_params, 
xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX256SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x8c8__avx256skx, 16, false, 1, 8, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x8c8__avx256skx, 16, false, 2, 8, 8, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x8c8__avx256skx, 16, false, 3, 8, 8, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x8c8__avx256skx, 16, false, 4, 8, 8, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x8c8__avx256skx, 16, false, 5, 8, 8, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_6x8c8__avx256skx, 16, false, 6, 8, 8, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x8c8__avx256skx, 16, false, 7, 8, 8, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_8x8c8__avx256skx, 16, false, 8, 8, 8, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, 
xnn_pack_qs8_qc4w_gemm_goi_w) +#endif // XNN_ENABLE_AVX256SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_GEMM(xnn_arch_x86_avx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4c8__avx_ld128, 16, false, 1, 4, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x4c8__avx_ld128, 16, false, 2, 4, 8, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x4c8__avx_ld128, 16, false, 3, 4, 8, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4c8__avx_ld128, 16, false, 4, 4, 8, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4c8__avx_ld64, 16, false, 1, 4, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x4c8__avx_ld64, 16, false, 2, 4, 8, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x4c8__avx_ld64, 16, false, 3, 4, 8, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4c8__avx_ld64, 16, false, 4, 4, 8, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, 
xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4c8__sse41_ld128, 16, false, 1, 4, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x4c8__sse41_ld128, 16, false, 2, 4, 8, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x4c8__sse41_ld128, 16, false, 3, 4, 8, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4c8__sse41_ld128, 16, false, 4, 4, 8, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4c8__sse41_ld64, 16, false, 1, 4, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x4c8__sse41_ld64, 16, false, 2, 4, 8, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x4c8__sse41_ld64, 16, false, 3, 4, 8, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4c8__sse41_ld64, 16, false, 4, 4, 8, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4c8__sse2_ld128, 16, false, 1, 4, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, 
xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x4c8__sse2_ld128, 16, false, 2, 4, 8, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x4c8__sse2_ld128, 16, false, 3, 4, 8, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4c8__sse2_ld128, 16, false, 4, 4, 8, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4c8__sse2_ld64, 16, false, 1, 4, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x4c8__sse2_ld64, 16, false, 2, 4, 8, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x4c8__sse2_ld64, 16, false, 3, 4, 8, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4c8__sse2_ld64, 16, false, 4, 4, 8, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) +XNN_GEMM(xnn_arch_riscv_vector, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4v__rvv, 2, false, 1, 4, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_riscv_vector, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x4v__rvv, 2, false, 2, 4, 1, 1, 2, false, float, struct xnn_f32_default_params, 
xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_riscv_vector, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x4v__rvv, 2, false, 3, 4, 1, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_riscv_vector, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4v__rvv, 2, false, 4, 4, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_riscv_vector, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x4v__rvv, 2, false, 5, 4, 1, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_riscv_vector, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_6x4v__rvv, 2, false, 6, 4, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_riscv_vector, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x4v__rvv, 2, false, 7, 4, 1, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(xnn_arch_riscv_vector, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_8x4v__rvv, 2, false, 8, 4, 1, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +#endif // XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) + +#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +XNN_GEMM(0, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4c8__wasmsimd_dot16x2_ld64, 16, false, 1, 4, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x4c8__wasmsimd_dot16x2_ld64, 16, false, 2, 4, 8, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(0, 
xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x4c8__wasmsimd_dot16x2_ld64, 16, false, 3, 4, 8, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4c8__wasmsimd_dot16x2_ld64, 16, false, 4, 4, 8, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + +XNN_GEMM(0, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x1__scalar, 2, false, 1, 1, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x2__scalar, 2, false, 1, 2, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4__scalar, 2, false, 1, 4, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x8__scalar, 2, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x2__scalar, 2, false, 2, 2, 1, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x4__scalar, 2, false, 2, 4, 1, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x8__scalar, 2, false, 2, 8, 1, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4__scalar, 2, false, 4, 4, 1, 1, 4, 
false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) + +#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +XNN_GEMM(0, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x2__wasm, 2, false, 1, 2, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4__wasm, 2, false, 1, 4, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x8__wasm, 2, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x2__wasm, 2, false, 2, 2, 1, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x4__wasm, 2, false, 2, 4, 1, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_2x8__wasm, 2, false, 2, 8, 1, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4__wasm, 2, false, 4, 4, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_qc4w_minmax_scalar_params, xnn_pack_qs8_qc4w_gemm_goi_w) +#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + diff --git a/src/qd8-f32-qc8w-gemm/qd8-f32-qc8w-gemm-minmax.h b/src/qd8-f32-qc8w-gemm/qd8-f32-qc8w-gemm-minmax.h index e69de29bb2d..ccb87ad8ebd 100644 --- a/src/qd8-f32-qc8w-gemm/qd8-f32-qc8w-gemm-minmax.h +++ b/src/qd8-f32-qc8w-gemm/qd8-f32-qc8w-gemm-minmax.h @@ -0,0 +1,404 @@ +// Copyright 2023 Google LLC + // + // This source 
code is licensed under the BSD-style license found in the + // LICENSE file in the root directory of this source tree. + // Arguments are: + // XNN_GEMM(arch_flags, fn_name, k_block, is_pipelined, mr, nr, kr, sr, mr_packed, is_igemm, datatype, params_type, init_fn, pack_fn) + + +#if XNN_ENABLE_AVX512AMX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +XNN_GEMM(xnn_arch_x86_avx512amx, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x64c4__avx512amx, 64, false, 1, 64, 4, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512amx, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x64c4__avx512amx, 64, false, 7, 64, 4, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512amx, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_16x64c4__avx512amx, 64, false, 16, 64, 4, 1, 16, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512amx, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_16x64c4__avx512amx_prfm, 64, false, 16, 64, 4, 1, 16, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512amx, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x32c4__avx512amx, 64, false, 1, 32, 4, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512amx, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x32c4__avx512amx, 64, false, 7, 32, 4, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512amx, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_16x32c4__avx512amx, 64, false, 16, 32, 4, 1, 16, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512amx, 
xnn_qd8_f32_qc8w_gemm_minmax_ukernel_16x32c4__avx512amx_prfm, 64, false, 16, 32, 4, 1, 16, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512amx, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16c4__avx512amx, 64, false, 1, 16, 4, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512amx, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x16c4__avx512amx, 64, false, 7, 16, 4, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512amx, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_16x16c4__avx512amx, 64, false, 16, 16, 4, 1, 16, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512amx, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_16x16c4__avx512amx_prfm, 64, false, 16, 16, 4, 1, 16, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +#endif // XNN_ENABLE_AVX512AMX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + +#if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x8c4__asm_aarch32_neondot_cortex_a55, 8, false, 4, 8, 4, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +#endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM) + +#if XNN_ENABLE_ARM_I8MM && (XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c8__neoni8mm, 16, false, 1, 8, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16c8__neoni8mm, 16, false, 1, 16, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) 
+XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x32c8__neoni8mm, 16, false, 1, 32, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x8c8__neoni8mm, 16, false, 2, 8, 8, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x16c8__neoni8mm, 16, false, 2, 16, 8, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x32c8__neoni8mm, 16, false, 2, 32, 8, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x8c8__neoni8mm, 16, false, 3, 8, 8, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x16c8__neoni8mm, 16, false, 3, 16, 8, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x32c8__neoni8mm, 16, false, 3, 32, 8, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x8c8__neoni8mm, 16, false, 4, 8, 8, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x16c8__neoni8mm, 16, false, 4, 16, 8, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) 
+XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x32c8__neoni8mm, 16, false, 4, 32, 8, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_5x8c8__neoni8mm, 16, false, 5, 8, 8, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_5x16c8__neoni8mm, 16, false, 5, 16, 8, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_5x32c8__neoni8mm, 16, false, 5, 32, 8, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_6x8c8__neoni8mm, 16, false, 6, 8, 8, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_6x16c8__neoni8mm, 16, false, 6, 16, 8, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_6x32c8__neoni8mm, 16, false, 6, 32, 8, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x8c8__neoni8mm, 16, false, 7, 8, 8, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x16c8__neoni8mm, 16, false, 7, 16, 8, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) 
+XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x32c8__neoni8mm, 16, false, 7, 32, 8, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_8x8c8__neoni8mm, 16, false, 8, 8, 8, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_8x16c8__neoni8mm, 16, false, 8, 16, 8, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_8x32c8__neoni8mm, 16, false, 8, 32, 8, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +#endif // XNN_ENABLE_ARM_I8MM && (XNN_ARCH_ARM64) + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8__neon_mlal_lane, 8, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8__neon_mlal_lane_prfm, 8, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c2s4__neon_mlal, 16, false, 1, 8, 2, 4, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c4__neondot, 4, false, 1, 8, 4, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +#endif // XNN_ENABLE_ARM_DOTPROD && 
(XNN_ARCH_ARM || XNN_ARCH_ARM64) + +#if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c8__aarch64_neondot_ld128, 16, false, 1, 8, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +#endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM64) + +#if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c8__neondot_ld64, 8, false, 1, 8, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +#endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16__neon_mlal_lane, 8, false, 1, 16, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16__neon_mlal_lane_prfm, 8, false, 1, 16, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16c4__neondot, 4, false, 1, 16, 4, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +#endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) + +#if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16c8__aarch64_neondot_ld128, 16, false, 1, 16, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +#endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM64) + +#if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || 
XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16c8__neondot_ld64, 8, false, 1, 16, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +#endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x8__neon_mlal_lane, 8, false, 2, 8, 1, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x8__neon_mlal_lane_prfm, 8, false, 2, 8, 1, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x8c2s4__neon_mlal, 16, false, 2, 8, 2, 4, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x8c4__neondot, 4, false, 2, 8, 4, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +#endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x16__neon_mlal_lane, 8, false, 2, 16, 1, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x16__neon_mlal_lane_prfm, 8, false, 2, 16, 1, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) 
+XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x16c4__neondot, 4, false, 2, 16, 4, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +#endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x8__neon_mlal_lane, 8, false, 3, 8, 1, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x8__neon_mlal_lane_prfm, 8, false, 3, 8, 1, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x8c4__neondot, 4, false, 3, 8, 4, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +#endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x16__neon_mlal_lane, 8, false, 3, 16, 1, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x16__neon_mlal_lane_prfm, 8, false, 3, 16, 1, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x16c4__neondot, 4, false, 3, 16, 4, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +#endif // XNN_ENABLE_ARM_DOTPROD 
&& (XNN_ARCH_ARM || XNN_ARCH_ARM64) + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x8__neon_mlal_lane, 8, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x8__neon_mlal_lane_prfm, 8, false, 4, 8, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x8c4__neondot, 4, false, 4, 8, 4, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +#endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x16__neon_mlal_lane, 8, false, 4, 16, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x16__neon_mlal_lane_prfm, 8, false, 4, 16, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x16c4__asm_aarch64_neondot_cortex_a55, 16, false, 4, 16, 4, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x16c4__asm_aarch64_neondot_ld64, 8, false, 4, 16, 4, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot, 
xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x16c4__asm_aarch64_neondot_ld128, 16, false, 4, 16, 4, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +#endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM64) + +#if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x16c4__neondot, 4, false, 4, 16, 4, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_5x8c4__neondot, 4, false, 5, 8, 4, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_5x16c4__neondot, 4, false, 5, 16, 4, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +#endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_6x8__neon_mlal_lane, 8, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_6x8__neon_mlal_lane_prfm, 8, false, 6, 8, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_6x8c4__neondot, 4, false, 6, 8, 4, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +#endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, 
xnn_qd8_f32_qc8w_gemm_minmax_ukernel_6x16__neon_mlal_lane, 8, false, 6, 16, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_6x16__neon_mlal_lane_prfm, 8, false, 6, 16, 1, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_6x16c4__neondot, 4, false, 6, 16, 4, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +#endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_GEMM(0, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c8__sse2_ld64, 8, false, 1, 4, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c8__sse2_ld128, 8, false, 1, 4, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c8__sse41_ld64, 8, false, 1, 4, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c8__sse41_ld128, 8, false, 1, 4, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c8__sse2_ld64, 8, false, 2, 4, 8, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c8__sse2_ld128, 8, false, 2, 4, 8, 1, 2, false, float, struct 
xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c8__sse41_ld64, 8, false, 2, 4, 8, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c8__sse41_ld128, 8, false, 2, 4, 8, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x4c8__sse2_ld64, 8, false, 3, 4, 8, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x4c8__sse2_ld128, 8, false, 3, 4, 8, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x4c8__sse41_ld64, 8, false, 3, 4, 8, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x4c8__sse41_ld128, 8, false, 3, 4, 8, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4c8__sse2_ld64, 8, false, 4, 4, 8, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4c8__sse2_ld128, 8, false, 4, 4, 8, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4c8__sse41_ld64, 8, false, 4, 4, 8, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_sse4_1, 
xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4c8__sse41_ld128, 8, false, 4, 4, 8, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c8__avx_ld64, 8, false, 1, 4, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c8__avx_ld128, 8, false, 1, 4, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c8__avx_ld64, 8, false, 2, 4, 8, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c8__avx_ld128, 8, false, 2, 4, 8, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x4c8__avx_ld64, 8, false, 3, 4, 8, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x4c8__avx_ld128, 8, false, 3, 4, 8, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4c8__avx_ld64, 8, false, 4, 4, 8, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4c8__avx_ld128, 8, false, 4, 4, 8, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c8__avx2, 8, false, 1, 8, 8, 1, 1, false, 
float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x8c8__avx2, 8, false, 2, 8, 8, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x8c8__avx2, 8, false, 3, 8, 8, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x8c8__avx2, 8, false, 4, 8, 8, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX256SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c8__avx256skx, 8, false, 1, 8, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_5x8c8__avx256skx, 8, false, 5, 8, 8, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x8c8__avx256skx, 8, false, 7, 8, 8, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_8x8c8__avx256skx, 8, false, 8, 8, 8, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +#endif // XNN_ENABLE_AVX256SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16c8__avx512skx, 8, false, 1, 16, 8, 1, 1, false, float, struct 
xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_5x16c8__avx512skx, 8, false, 5, 16, 8, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x16c8__avx512skx, 8, false, 7, 16, 8, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_8x16c8__avx512skx, 8, false, 8, 16, 8, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16c8__avx512skx_prfm, 8, false, 1, 16, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_5x16c8__avx512skx_prfm, 8, false, 5, 16, 8, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x16c8__avx512skx_prfm, 8, false, 7, 16, 8, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_8x16c8__avx512skx_prfm, 8, false, 8, 16, 8, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + +#if XNN_ENABLE_AVX512VNNI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16c4__avx512vnni, 8, false, 1, 16, 4, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, 
xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x16c4__avx512vnni, 8, false, 4, 16, 4, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_5x16c4__avx512vnni, 8, false, 5, 16, 4, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x16c4__avx512vnni, 8, false, 7, 16, 4, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_8x16c4__avx512vnni, 8, false, 8, 16, 4, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_9x16c4__avx512vnni, 8, false, 9, 16, 4, 1, 9, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_10x16c4__avx512vnni, 8, false, 10, 16, 4, 1, 10, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_12x16c4__avx512vnni, 8, false, 12, 16, 4, 1, 12, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_14x16c4__avx512vnni, 8, false, 14, 16, 4, 1, 14, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16c4__avx512vnni_prfm, 8, false, 1, 16, 4, 1, 1, false, float, struct xnn_f32_default_params, 
xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x16c4__avx512vnni_prfm, 8, false, 4, 16, 4, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_5x16c4__avx512vnni_prfm, 8, false, 5, 16, 4, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x16c4__avx512vnni_prfm, 8, false, 7, 16, 4, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_8x16c4__avx512vnni_prfm, 8, false, 8, 16, 4, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_9x16c4__avx512vnni_prfm, 8, false, 9, 16, 4, 1, 9, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_10x16c4__avx512vnni_prfm, 8, false, 10, 16, 4, 1, 10, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_12x16c4__avx512vnni_prfm, 8, false, 12, 16, 4, 1, 12, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_14x16c4__avx512vnni_prfm, 8, false, 14, 16, 4, 1, 14, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16c8__avx512vnni, 16, false, 1, 
16, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_5x16c8__avx512vnni, 16, false, 5, 16, 8, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x16c8__avx512vnni, 16, false, 7, 16, 8, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_8x16c8__avx512vnni, 16, false, 8, 16, 8, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_9x16c8__avx512vnni, 16, false, 9, 16, 8, 1, 9, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_10x16c8__avx512vnni, 16, false, 10, 16, 8, 1, 10, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_12x16c8__avx512vnni, 16, false, 12, 16, 8, 1, 12, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_14x16c8__avx512vnni, 16, false, 14, 16, 8, 1, 14, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16c8__avx512vnni_prfm, 16, false, 1, 16, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, 
xnn_qd8_f32_qc8w_gemm_minmax_ukernel_5x16c8__avx512vnni_prfm, 16, false, 5, 16, 8, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x16c8__avx512vnni_prfm, 16, false, 7, 16, 8, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_8x16c8__avx512vnni_prfm, 16, false, 8, 16, 8, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_9x16c8__avx512vnni_prfm, 16, false, 9, 16, 8, 1, 9, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_10x16c8__avx512vnni_prfm, 16, false, 10, 16, 8, 1, 10, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_12x16c8__avx512vnni_prfm, 16, false, 12, 16, 8, 1, 12, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_14x16c8__avx512vnni_prfm, 16, false, 14, 16, 8, 1, 14, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +#endif // XNN_ENABLE_AVX512VNNI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + +#if XNN_ENABLE_AVX256VNNI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c8__avx256vnni, 16, false, 1, 8, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, 
xnn_qd8_f32_qc8w_gemm_minmax_ukernel_5x8c8__avx256vnni, 16, false, 5, 8, 8, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x8c8__avx256vnni, 16, false, 7, 8, 8, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_8x8c8__avx256vnni, 16, false, 8, 8, 8, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_9x8c8__avx256vnni, 16, false, 9, 8, 8, 1, 9, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_10x8c8__avx256vnni, 16, false, 10, 8, 8, 1, 10, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_12x8c8__avx256vnni, 16, false, 12, 8, 8, 1, 12, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_14x8c8__avx256vnni, 16, false, 14, 8, 8, 1, 14, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c8__avx256vnni_prfm, 16, false, 1, 8, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_5x8c8__avx256vnni_prfm, 16, false, 5, 8, 8, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) 
+XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x8c8__avx256vnni_prfm, 16, false, 7, 8, 8, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_8x8c8__avx256vnni_prfm, 16, false, 8, 8, 8, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_9x8c8__avx256vnni_prfm, 16, false, 9, 8, 8, 1, 9, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_10x8c8__avx256vnni_prfm, 16, false, 10, 8, 8, 1, 10, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_12x8c8__avx256vnni_prfm, 16, false, 12, 8, 8, 1, 12, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_14x8c8__avx256vnni_prfm, 16, false, 14, 8, 8, 1, 14, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +#endif // XNN_ENABLE_AVX256VNNI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + +#if XNN_ENABLE_AVXVNNI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c8__avxvnni, 16, false, 1, 8, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x8c8__avxvnni, 16, false, 2, 8, 8, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, 
xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x8c8__avxvnni, 16, false, 3, 8, 8, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x8c8__avxvnni, 16, false, 4, 8, 8, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_5x8c8__avxvnni, 16, false, 5, 8, 8, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_6x8c8__avxvnni, 16, false, 6, 8, 8, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x8c8__avxvnni, 16, false, 7, 8, 8, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_8x8c8__avxvnni, 16, false, 8, 8, 8, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c8__avxvnni_prfm, 16, false, 1, 8, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x8c8__avxvnni_prfm, 16, false, 2, 8, 8, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x8c8__avxvnni_prfm, 16, false, 3, 8, 8, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, 
xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x8c8__avxvnni_prfm, 16, false, 4, 8, 8, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_5x8c8__avxvnni_prfm, 16, false, 5, 8, 8, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_6x8c8__avxvnni_prfm, 16, false, 6, 8, 8, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x8c8__avxvnni_prfm, 16, false, 7, 8, 8, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_8x8c8__avxvnni_prfm, 16, false, 8, 8, 8, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c4__avxvnni_u2_acc2, 16, false, 1, 8, 4, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x8c4__avxvnni_u2_acc2, 16, false, 2, 8, 4, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x8c4__avxvnni_u2_acc2, 16, false, 3, 8, 4, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x8c4__avxvnni_u2_acc2, 16, false, 4, 8, 4, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) 
+XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_5x8c4__avxvnni_u2_acc2, 16, false, 5, 8, 4, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_6x8c4__avxvnni_u2_acc2, 16, false, 6, 8, 4, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x8c4__avxvnni_u2_acc2, 16, false, 7, 8, 4, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_8x8c4__avxvnni_u2_acc2, 16, false, 8, 8, 4, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c4__avxvnni_u4_acc4, 32, false, 1, 8, 4, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x8c4__avxvnni_u4_acc4, 32, false, 2, 8, 4, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x8c4__avxvnni_u4_acc4, 32, false, 3, 8, 4, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x8c4__avxvnni_u4_acc4, 32, false, 4, 8, 4, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_5x8c4__avxvnni_u4_acc4, 32, false, 5, 8, 4, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, 
xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_6x8c4__avxvnni_u4_acc4, 32, false, 6, 8, 4, 1, 6, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x8c4__avxvnni_u4_acc4, 32, false, 7, 8, 4, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_8x8c4__avxvnni_u4_acc4, 32, false, 8, 8, 4, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +#endif // XNN_ENABLE_AVXVNNI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + +#if XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) +XNN_GEMM(xnn_arch_riscv_vector, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4v__rvv, 1, false, 1, 4, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_riscv_vector, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4v__rvv, 1, false, 2, 4, 1, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_riscv_vector, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x4v__rvv, 1, false, 3, 4, 1, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_riscv_vector, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4v__rvv, 1, false, 4, 4, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_riscv_vector, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_5x4v__rvv, 1, false, 5, 4, 1, 1, 5, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_riscv_vector, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_6x4v__rvv, 1, false, 6, 4, 1, 1, 6, false, float, struct 
xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_riscv_vector, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x4v__rvv, 1, false, 7, 4, 1, 1, 7, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_riscv_vector, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_8x4v__rvv, 1, false, 8, 4, 1, 1, 8, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +#endif // XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) + +#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +XNN_GEMM(0, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c2__wasmsimd_dot16x2_ld64, 8, false, 1, 4, 2, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c2__wasmsimd_dot16x2_ld128, 8, false, 1, 4, 2, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64, 8, false, 1, 4, 2, 4, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128, 8, false, 1, 4, 2, 4, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c8__wasmsimd_dot16x2_ld64, 8, false, 1, 4, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c8__wasmsimd_dot16x2_ld128, 8, false, 1, 4, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c2__wasmsimd_dot16x2_ld64, 8, false, 2, 4, 2, 1, 2, false, float, 
struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c2__wasmsimd_dot16x2_ld128, 8, false, 2, 4, 2, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64, 8, false, 2, 4, 2, 4, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128, 8, false, 2, 4, 2, 4, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c8__wasmsimd_dot16x2_ld64, 8, false, 2, 4, 8, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c8__wasmsimd_dot16x2_ld128, 8, false, 2, 4, 8, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x4c2__wasmsimd_dot16x2_ld64, 8, false, 3, 4, 2, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x4c2__wasmsimd_dot16x2_ld128, 8, false, 3, 4, 2, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64, 8, false, 3, 4, 2, 4, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128, 8, false, 3, 4, 2, 4, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) 
+XNN_GEMM(0, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x4c8__wasmsimd_dot16x2_ld64, 8, false, 3, 4, 8, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x4c8__wasmsimd_dot16x2_ld128, 8, false, 3, 4, 8, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4c2__wasmsimd_dot16x2_ld64, 8, false, 4, 4, 2, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4c2__wasmsimd_dot16x2_ld128, 8, false, 4, 4, 2, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64, 8, false, 4, 4, 2, 4, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, 8, false, 4, 4, 2, 4, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4c8__wasmsimd_dot16x2_ld64, 8, false, 4, 4, 8, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4c8__wasmsimd_dot16x2_ld128, 8, false, 4, 4, 8, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + +#if XNN_ARCH_WASMRELAXEDSIMD +XNN_GEMM(xnn_arch_wasm_sdot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c16__wasmsdot, 16, false, 1, 4, 16, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) 
+XNN_GEMM(xnn_arch_wasm_usdot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c16__wasmusdot, 16, false, 1, 4, 16, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_wasm_sdot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c16__wasmsdot, 16, false, 1, 8, 16, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_wasm_usdot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c16__wasmusdot, 16, false, 1, 8, 16, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_wasm_sdot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c16__wasmsdot, 16, false, 2, 4, 16, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_wasm_usdot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c16__wasmusdot, 16, false, 2, 4, 16, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_wasm_sdot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x8c16__wasmsdot, 16, false, 2, 8, 16, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_wasm_usdot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x8c16__wasmusdot, 16, false, 2, 8, 16, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_wasm_sdot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x4c16__wasmsdot, 16, false, 3, 4, 16, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_wasm_usdot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x4c16__wasmusdot, 16, false, 3, 4, 16, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_wasm_sdot, 
xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x8c16__wasmsdot, 16, false, 3, 8, 16, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_wasm_usdot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x8c16__wasmusdot, 16, false, 3, 8, 16, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_wasm_sdot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4c16__wasmsdot, 16, false, 4, 4, 16, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_wasm_usdot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4c16__wasmusdot, 16, false, 4, 4, 16, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_wasm_sdot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x8c16__wasmsdot, 16, false, 4, 8, 16, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_wasm_usdot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x8c16__wasmusdot, 16, false, 4, 8, 16, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_wasm_usdot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c8__wasmusdot, 8, false, 1, 8, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_wasm_usdot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c8__wasmusdot_u2, 8, false, 1, 8, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_wasm_usdot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x8c8__wasmusdot, 8, false, 2, 8, 8, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_wasm_usdot, 
xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x8c8__wasmusdot_u2, 8, false, 2, 8, 8, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_wasm_usdot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x8c8__wasmusdot, 8, false, 3, 8, 8, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_wasm_usdot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x8c8__wasmusdot_u2, 8, false, 3, 8, 8, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_wasm_usdot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x8c8__wasmusdot, 8, false, 4, 8, 8, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_wasm_usdot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x8c8__wasmusdot_u2, 8, false, 4, 8, 8, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_wasm_sdot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c8__wasmsdot, 8, false, 1, 8, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_wasm_sdot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c8__wasmsdot_u2, 8, false, 1, 8, 8, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_wasm_sdot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x8c8__wasmsdot, 8, false, 2, 8, 8, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_wasm_sdot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x8c8__wasmsdot_u2, 8, false, 2, 8, 8, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_wasm_sdot, 
xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x8c8__wasmsdot, 8, false, 3, 8, 8, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_wasm_sdot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x8c8__wasmsdot_u2, 8, false, 3, 8, 8, 1, 3, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_wasm_sdot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x8c8__wasmsdot, 8, false, 4, 8, 8, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_wasm_sdot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x8c8__wasmsdot_u2, 8, false, 4, 8, 8, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_wasm_usdot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16c4__wasmusdot, 8, false, 1, 16, 4, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_wasm_usdot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16c4__wasmusdot_u2, 8, false, 1, 16, 4, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_wasm_usdot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x16c4__wasmusdot, 8, false, 4, 16, 4, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(xnn_arch_wasm_usdot, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x16c4__wasmusdot_u2, 8, false, 4, 16, 4, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +#endif // XNN_ARCH_WASMRELAXEDSIMD + +#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +XNN_GEMM(0, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x2__wasm, 1, false, 1, 2, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, 
xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4__wasm, 1, false, 1, 4, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8__wasm, 1, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x2__wasm, 1, false, 2, 2, 1, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4__wasm, 1, false, 2, 4, 1, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x8__wasm, 1, false, 2, 8, 1, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4__wasm, 1, false, 4, 4, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + +XNN_GEMM(0, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x2__scalar, 1, false, 1, 2, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4__scalar, 1, false, 1, 4, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8__scalar, 1, false, 1, 8, 1, 1, 1, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x2__scalar, 1, false, 2, 2, 1, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, 
xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4__scalar, 1, false, 2, 4, 1, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x8__scalar, 1, false, 2, 8, 1, 1, 2, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4__scalar, 1, false, 4, 4, 1, 1, 4, false, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_gemm_goi_w) diff --git a/src/qd8-f32-qc8w-igemm/qd8-f32-qc8w-igemm-minmax.h b/src/qd8-f32-qc8w-igemm/qd8-f32-qc8w-igemm-minmax.h index e69de29bb2d..b17af5f4ec6 100644 --- a/src/qd8-f32-qc8w-igemm/qd8-f32-qc8w-igemm-minmax.h +++ b/src/qd8-f32-qc8w-igemm/qd8-f32-qc8w-igemm-minmax.h @@ -0,0 +1,401 @@ +// Copyright 2023 Google LLC + // + // This source code is licensed under the BSD-style license found in the + // LICENSE file in the root directory of this source tree. 
+ // Arguments are: + // XNN_GEMM(arch_flags, fn_name, k_block, is_pipelined, mr, nr, kr, sr, mr_packed, is_igemm, datatype, params_type, init_fn, pack_fn) + + +#if XNN_ENABLE_AVX512AMX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +XNN_GEMM(xnn_arch_x86_avx512amx, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x16c4__avx512amx, 64, false, 1, 16, 4, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512amx, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_7x16c4__avx512amx, 64, false, 7, 16, 4, 1, 7, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512amx, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_16x16c4__avx512amx, 64, false, 16, 16, 4, 1, 16, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512amx, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_16x16c4__avx512amx_prfm, 64, false, 16, 16, 4, 1, 16, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512amx, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x32c4__avx512amx, 64, false, 1, 32, 4, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512amx, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_7x32c4__avx512amx, 64, false, 7, 32, 4, 1, 7, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512amx, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_16x32c4__avx512amx, 64, false, 16, 32, 4, 1, 16, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512amx, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_16x32c4__avx512amx_prfm, 64, false, 16, 32, 4, 1, 16, true, float, struct xnn_f32_default_params, 
xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512amx, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x64c4__avx512amx, 64, false, 1, 64, 4, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512amx, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_7x64c4__avx512amx, 64, false, 7, 64, 4, 1, 7, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512amx, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_16x64c4__avx512amx, 64, false, 16, 64, 4, 1, 16, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512amx, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_16x64c4__avx512amx_prfm, 64, false, 16, 64, 4, 1, 16, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +#endif // XNN_ENABLE_AVX512AMX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + +XNN_GEMM(0, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x2__scalar, 2, false, 1, 2, 1, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x4__scalar, 2, false, 1, 4, 1, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x8__scalar, 2, false, 1, 8, 1, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_2x2__scalar, 2, false, 2, 2, 1, 1, 2, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_2x4__scalar, 2, false, 2, 4, 1, 1, 2, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, 
xnn_pack_qs8_conv_goki_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_2x8__scalar, 2, false, 2, 8, 1, 1, 2, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x4__scalar, 2, false, 4, 4, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x8__neon_mlal_lane, 8, false, 1, 8, 1, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x8__neon_mlal_lane_prfm, 8, false, 1, 8, 1, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x8c2s4__neon_mlal, 16, false, 1, 8, 2, 4, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x8c4__neondot, 4, false, 1, 8, 4, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +#endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) + +#if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x8c8__aarch64_neondot_ld128, 8, false, 1, 8, 8, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +#endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM64) + +#if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x8c8__neondot_ld64, 
8, false, 1, 8, 8, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +#endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) + +#if XNN_ENABLE_ARM_I8MM && (XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x8c8__neoni8mm, 16, false, 1, 8, 8, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +#endif // XNN_ENABLE_ARM_I8MM && (XNN_ARCH_ARM64) + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x16__neon_mlal_lane, 8, false, 1, 16, 1, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x16__neon_mlal_lane_prfm, 8, false, 1, 16, 1, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x16c4__neondot, 4, false, 1, 16, 4, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +#endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) + +#if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x16c8__aarch64_neondot_ld128, 8, false, 1, 16, 8, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +#endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM64) + +#if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x16c8__neondot_ld64, 8, false, 1, 16, 8, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, 
xnn_pack_qs8_conv_goki_w) +#endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) + +#if XNN_ENABLE_ARM_I8MM && (XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x16c8__neoni8mm, 16, false, 1, 16, 8, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +#endif // XNN_ENABLE_ARM_I8MM && (XNN_ARCH_ARM64) + +#if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x32c4__neondot, 4, false, 1, 32, 4, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +#endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_2x8__neon_mlal_lane, 8, false, 2, 8, 1, 1, 2, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_2x8__neon_mlal_lane_prfm, 8, false, 2, 8, 1, 1, 2, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_2x8c2s4__neon_mlal, 16, false, 2, 8, 2, 4, 2, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_2x8c4__neondot, 4, false, 2, 8, 4, 1, 2, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +#endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) + +#if XNN_ENABLE_ARM_I8MM && (XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_2x8c8__neoni8mm, 
16, false, 2, 8, 8, 1, 2, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +#endif // XNN_ENABLE_ARM_I8MM && (XNN_ARCH_ARM64) + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_2x16__neon_mlal_lane, 8, false, 2, 16, 1, 1, 2, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_2x16__neon_mlal_lane_prfm, 8, false, 2, 16, 1, 1, 2, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_2x16c4__neondot, 4, false, 2, 16, 4, 1, 2, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +#endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) + +#if XNN_ENABLE_ARM_I8MM && (XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_2x16c8__neoni8mm, 16, false, 2, 16, 8, 1, 2, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +#endif // XNN_ENABLE_ARM_I8MM && (XNN_ARCH_ARM64) + +#if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_2x32c4__neondot, 4, false, 2, 32, 4, 1, 2, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +#endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_3x8__neon_mlal_lane, 8, false, 3, 8, 1, 1, 3, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) 
+XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_3x8__neon_mlal_lane_prfm, 8, false, 3, 8, 1, 1, 3, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ENABLE_ARM_I8MM && (XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_3x8c8__neoni8mm, 16, false, 3, 8, 8, 1, 3, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +#endif // XNN_ENABLE_ARM_I8MM && (XNN_ARCH_ARM64) + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_3x16__neon_mlal_lane, 8, false, 3, 16, 1, 1, 3, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_3x16__neon_mlal_lane_prfm, 8, false, 3, 16, 1, 1, 3, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ENABLE_ARM_I8MM && (XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_3x16c8__neoni8mm, 16, false, 3, 16, 8, 1, 3, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +#endif // XNN_ENABLE_ARM_I8MM && (XNN_ARCH_ARM64) + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x8__neon_mlal_lane, 8, false, 4, 8, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x8__neon_mlal_lane_prfm, 8, false, 4, 8, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM) 
+XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x8c4__asm_aarch32_neondot_cortex_a55, 8, false, 4, 8, 4, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +#endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM) + +#if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x8c4__neondot, 4, false, 4, 8, 4, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +#endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) + +#if XNN_ENABLE_ARM_I8MM && (XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x8c8__neoni8mm, 16, false, 4, 8, 8, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +#endif // XNN_ENABLE_ARM_I8MM && (XNN_ARCH_ARM64) + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x16__neon_mlal_lane, 8, false, 4, 16, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x16__neon_mlal_lane_prfm, 8, false, 4, 16, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x16c4__asm_aarch64_neondot_cortex_a55, 16, false, 4, 16, 4, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x16c4__asm_aarch64_neondot_ld128, 16, false, 4, 16, 4, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, 
xnn_pack_qs8_conv_goki_w) +#endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM64) + +#if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x16c4__neondot, 4, false, 4, 16, 4, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +#endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) + +#if XNN_ENABLE_ARM_I8MM && (XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x16c8__neoni8mm, 16, false, 4, 16, 8, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +#endif // XNN_ENABLE_ARM_I8MM && (XNN_ARCH_ARM64) + +#if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x32c4__neondot, 4, false, 4, 32, 4, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +#endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_6x8__neon_mlal_lane, 8, false, 6, 8, 1, 1, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_6x8__neon_mlal_lane_prfm, 8, false, 6, 8, 1, 1, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_6x8c4__neondot, 4, false, 6, 8, 4, 1, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +#endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) + +#if 
XNN_ENABLE_ARM_I8MM && (XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_6x8c8__neoni8mm, 16, false, 6, 8, 8, 1, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +#endif // XNN_ENABLE_ARM_I8MM && (XNN_ARCH_ARM64) + +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_6x16__neon_mlal_lane, 8, false, 6, 16, 1, 1, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_6x16__neon_mlal_lane_prfm, 8, false, 6, 16, 1, 1, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 + +#if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_6x16c4__neondot, 4, false, 6, 16, 4, 1, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +#endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) + +#if XNN_ENABLE_ARM_I8MM && (XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_6x16c8__neoni8mm, 16, false, 6, 16, 8, 1, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +#endif // XNN_ENABLE_ARM_I8MM && (XNN_ARCH_ARM64) + +#if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_6x32c4__neondot, 4, false, 6, 32, 4, 1, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_8x8c4__neondot, 4, false, 8, 8, 4, 1, 8, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, 
xnn_pack_qs8_conv_goki_w) +#endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) + +#if XNN_ENABLE_ARM_I8MM && (XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_8x8c8__neoni8mm, 16, false, 8, 8, 8, 1, 8, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +#endif // XNN_ENABLE_ARM_I8MM && (XNN_ARCH_ARM64) + +#if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_8x16c4__neondot, 4, false, 8, 16, 4, 1, 8, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +#endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) + +#if XNN_ENABLE_ARM_I8MM && (XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_i8mm, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_8x16c8__neoni8mm, 16, false, 8, 16, 8, 1, 8, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +#endif // XNN_ENABLE_ARM_I8MM && (XNN_ARCH_ARM64) + +#if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) +XNN_GEMM(xnn_arch_arm_neon_dot, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_8x32c4__neondot, 4, false, 8, 32, 4, 1, 8, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +#endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64) + +#if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x16c8__avx512skx, 8, false, 1, 16, 8, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_5x16c8__avx512skx, 8, false, 5, 16, 8, 1, 5, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512skx, 
xnn_qd8_f32_qc8w_igemm_minmax_ukernel_7x16c8__avx512skx, 8, false, 7, 16, 8, 1, 7, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_8x16c8__avx512skx, 8, false, 8, 16, 8, 1, 8, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x16c8__avx512skx_prfm, 8, false, 1, 16, 8, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_5x16c8__avx512skx_prfm, 8, false, 5, 16, 8, 1, 5, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_7x16c8__avx512skx_prfm, 8, false, 7, 16, 8, 1, 7, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512skx, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_8x16c8__avx512skx_prfm, 8, false, 8, 16, 8, 1, 8, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +#endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + +#if XNN_ENABLE_AVX512VNNI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x16c4__avx512vnni, 8, false, 1, 16, 4, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x16c4__avx512vnni, 8, false, 4, 16, 4, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_5x16c4__avx512vnni, 8, 
false, 5, 16, 4, 1, 5, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_7x16c4__avx512vnni, 8, false, 7, 16, 4, 1, 7, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_8x16c4__avx512vnni, 8, false, 8, 16, 4, 1, 8, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_9x16c4__avx512vnni, 8, false, 9, 16, 4, 1, 9, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_10x16c4__avx512vnni, 8, false, 10, 16, 4, 1, 10, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_12x16c4__avx512vnni, 8, false, 12, 16, 4, 1, 12, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_14x16c4__avx512vnni, 8, false, 14, 16, 4, 1, 14, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x16c4__avx512vnni_prfm, 8, false, 1, 16, 4, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x16c4__avx512vnni_prfm, 8, false, 4, 16, 4, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, 
xnn_qd8_f32_qc8w_igemm_minmax_ukernel_5x16c4__avx512vnni_prfm, 8, false, 5, 16, 4, 1, 5, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_7x16c4__avx512vnni_prfm, 8, false, 7, 16, 4, 1, 7, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_8x16c4__avx512vnni_prfm, 8, false, 8, 16, 4, 1, 8, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_9x16c4__avx512vnni_prfm, 8, false, 9, 16, 4, 1, 9, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_10x16c4__avx512vnni_prfm, 8, false, 10, 16, 4, 1, 10, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_12x16c4__avx512vnni_prfm, 8, false, 12, 16, 4, 1, 12, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_14x16c4__avx512vnni_prfm, 8, false, 14, 16, 4, 1, 14, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x16c8__avx512vnni, 16, false, 1, 16, 8, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_5x16c8__avx512vnni, 16, false, 5, 16, 8, 1, 5, true, float, struct xnn_f32_default_params, 
xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_7x16c8__avx512vnni, 16, false, 7, 16, 8, 1, 7, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_8x16c8__avx512vnni, 16, false, 8, 16, 8, 1, 8, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_9x16c8__avx512vnni, 16, false, 9, 16, 8, 1, 9, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_10x16c8__avx512vnni, 16, false, 10, 16, 8, 1, 10, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_12x16c8__avx512vnni, 16, false, 12, 16, 8, 1, 12, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_14x16c8__avx512vnni, 16, false, 14, 16, 8, 1, 14, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x16c8__avx512vnni_prfm, 16, false, 1, 16, 8, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_5x16c8__avx512vnni_prfm, 16, false, 5, 16, 8, 1, 5, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_7x16c8__avx512vnni_prfm, 16, false, 7, 16, 8, 
1, 7, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_8x16c8__avx512vnni_prfm, 16, false, 8, 16, 8, 1, 8, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_9x16c8__avx512vnni_prfm, 16, false, 9, 16, 8, 1, 9, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_10x16c8__avx512vnni_prfm, 16, false, 10, 16, 8, 1, 10, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_12x16c8__avx512vnni_prfm, 16, false, 12, 16, 8, 1, 12, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx512vnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_14x16c8__avx512vnni_prfm, 16, false, 14, 16, 8, 1, 14, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +#endif // XNN_ENABLE_AVX512VNNI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + +#if XNN_ENABLE_AVX256VNNI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x8c8__avx256vnni, 16, false, 1, 8, 8, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_5x8c8__avx256vnni, 16, false, 5, 8, 8, 1, 5, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_7x8c8__avx256vnni, 16, false, 7, 8, 8, 1, 7, true, float, struct 
xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_8x8c8__avx256vnni, 16, false, 8, 8, 8, 1, 8, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_9x8c8__avx256vnni, 16, false, 9, 8, 8, 1, 9, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_10x8c8__avx256vnni, 16, false, 10, 8, 8, 1, 10, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_12x8c8__avx256vnni, 16, false, 12, 8, 8, 1, 12, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_14x8c8__avx256vnni, 16, false, 14, 8, 8, 1, 14, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x8c8__avx256vnni_prfm, 16, false, 1, 8, 8, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_5x8c8__avx256vnni_prfm, 16, false, 5, 8, 8, 1, 5, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_7x8c8__avx256vnni_prfm, 16, false, 7, 8, 8, 1, 7, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_8x8c8__avx256vnni_prfm, 16, 
false, 8, 8, 8, 1, 8, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_9x8c8__avx256vnni_prfm, 16, false, 9, 8, 8, 1, 9, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_10x8c8__avx256vnni_prfm, 16, false, 10, 8, 8, 1, 10, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_12x8c8__avx256vnni_prfm, 16, false, 12, 8, 8, 1, 12, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx256vnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_14x8c8__avx256vnni_prfm, 16, false, 14, 8, 8, 1, 14, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +#endif // XNN_ENABLE_AVX256VNNI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + +#if XNN_ENABLE_AVXVNNI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x8c8__avxvnni, 16, false, 1, 8, 8, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_2x8c8__avxvnni, 16, false, 2, 8, 8, 1, 2, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_3x8c8__avxvnni, 16, false, 3, 8, 8, 1, 3, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x8c8__avxvnni, 16, false, 4, 8, 8, 1, 4, true, float, struct xnn_f32_default_params, 
xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_5x8c8__avxvnni, 16, false, 5, 8, 8, 1, 5, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_6x8c8__avxvnni, 16, false, 6, 8, 8, 1, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_7x8c8__avxvnni, 16, false, 7, 8, 8, 1, 7, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_8x8c8__avxvnni, 16, false, 8, 8, 8, 1, 8, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x8c8__avxvnni_prfm, 16, false, 1, 8, 8, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_2x8c8__avxvnni_prfm, 16, false, 2, 8, 8, 1, 2, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_3x8c8__avxvnni_prfm, 16, false, 3, 8, 8, 1, 3, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x8c8__avxvnni_prfm, 16, false, 4, 8, 8, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_5x8c8__avxvnni_prfm, 16, false, 5, 8, 8, 1, 5, true, float, struct xnn_f32_default_params, 
xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_6x8c8__avxvnni_prfm, 16, false, 6, 8, 8, 1, 6, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_7x8c8__avxvnni_prfm, 16, false, 7, 8, 8, 1, 7, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avxvnni, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_8x8c8__avxvnni_prfm, 16, false, 8, 8, 8, 1, 8, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +#endif // XNN_ENABLE_AVXVNNI && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + +#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +XNN_GEMM(0, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x2__wasm, 2, false, 1, 2, 1, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x4__wasm, 2, false, 1, 4, 1, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x8__wasm, 2, false, 1, 8, 1, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_2x2__wasm, 2, false, 2, 2, 1, 1, 2, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_2x4__wasm, 2, false, 2, 4, 1, 1, 2, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_2x8__wasm, 2, false, 2, 8, 1, 1, 2, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, 
xnn_pack_qs8_conv_goki_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x4__wasm, 2, false, 4, 4, 1, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + +#if XNN_ARCH_WASMRELAXEDSIMD +XNN_GEMM(xnn_arch_wasm_usdot, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x16c4__wasmusdot, 8, false, 1, 16, 4, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_wasm_usdot, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x16c4__wasmusdot_u2, 8, false, 1, 16, 4, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_wasm_usdot, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x16c4__wasmusdot, 8, false, 4, 16, 4, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_wasm_usdot, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x16c4__wasmusdot_u2, 8, false, 4, 16, 4, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_wasm_usdot, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x8c8__wasmusdot, 8, false, 1, 8, 8, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_wasm_usdot, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x8c8__wasmusdot_u2, 8, false, 1, 8, 8, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_wasm_usdot, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_2x8c8__wasmusdot, 8, false, 2, 8, 8, 1, 2, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_wasm_usdot, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_2x8c8__wasmusdot_u2, 8, false, 2, 8, 8, 1, 2, true, float, struct 
xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_wasm_usdot, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_3x8c8__wasmusdot, 8, false, 3, 8, 8, 1, 3, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_wasm_usdot, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_3x8c8__wasmusdot_u2, 8, false, 3, 8, 8, 1, 3, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_wasm_usdot, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x8c8__wasmusdot, 8, false, 4, 8, 8, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_wasm_usdot, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x8c8__wasmusdot_u2, 8, false, 4, 8, 8, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_wasm_sdot, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x8c8__wasmsdot, 8, false, 1, 8, 8, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_wasm_sdot, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x8c8__wasmsdot_u2, 8, false, 1, 8, 8, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_wasm_sdot, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_2x8c8__wasmsdot, 8, false, 2, 8, 8, 1, 2, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_wasm_sdot, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_2x8c8__wasmsdot_u2, 8, false, 2, 8, 8, 1, 2, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_wasm_sdot, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_3x8c8__wasmsdot, 8, false, 3, 8, 8, 1, 3, true, float, struct xnn_f32_default_params, 
xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_wasm_sdot, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_3x8c8__wasmsdot_u2, 8, false, 3, 8, 8, 1, 3, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_wasm_sdot, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x8c8__wasmsdot, 8, false, 4, 8, 8, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_wasm_sdot, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x8c8__wasmsdot_u2, 8, false, 4, 8, 8, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_wasm_sdot, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x4c16__wasmsdot, 16, false, 1, 4, 16, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_wasm_sdot, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_2x4c16__wasmsdot, 16, false, 2, 4, 16, 1, 2, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_wasm_sdot, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_3x4c16__wasmsdot, 16, false, 3, 4, 16, 1, 3, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_wasm_sdot, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x4c16__wasmsdot, 16, false, 4, 4, 16, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +#endif // XNN_ARCH_WASMRELAXEDSIMD + +#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +XNN_GEMM(0, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x4c2__wasmsimd_dot16x2_ld64, 8, false, 1, 4, 2, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x4c2__wasmsimd_dot16x2_ld128, 8, false, 1, 4, 2, 1, 1, true, 
float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64, 8, false, 1, 4, 2, 4, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128, 8, false, 1, 4, 2, 4, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x4c8__wasmsimd_dot16x2_ld64, 8, false, 1, 4, 8, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x4c8__wasmsimd_dot16x2_ld128, 8, false, 1, 4, 8, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_2x4c2__wasmsimd_dot16x2_ld64, 8, false, 2, 4, 2, 1, 2, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_2x4c2__wasmsimd_dot16x2_ld128, 8, false, 2, 4, 2, 1, 2, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64, 8, false, 2, 4, 2, 4, 2, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128, 8, false, 2, 4, 2, 4, 2, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_2x4c8__wasmsimd_dot16x2_ld64, 8, false, 2, 4, 8, 1, 2, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, 
xnn_pack_qs8_conv_goki_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_2x4c8__wasmsimd_dot16x2_ld128, 8, false, 2, 4, 8, 1, 2, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_3x4c2__wasmsimd_dot16x2_ld64, 8, false, 3, 4, 2, 1, 3, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_3x4c2__wasmsimd_dot16x2_ld128, 8, false, 3, 4, 2, 1, 3, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64, 8, false, 3, 4, 2, 4, 3, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128, 8, false, 3, 4, 2, 4, 3, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_3x4c8__wasmsimd_dot16x2_ld64, 8, false, 3, 4, 8, 1, 3, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_3x4c8__wasmsimd_dot16x2_ld128, 8, false, 3, 4, 8, 1, 3, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x4c2__wasmsimd_dot16x2_ld64, 8, false, 4, 4, 2, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x4c2__wasmsimd_dot16x2_ld128, 8, false, 4, 4, 2, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(0, 
xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64, 8, false, 4, 4, 2, 4, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, 8, false, 4, 4, 2, 4, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x4c8__wasmsimd_dot16x2_ld64, 8, false, 4, 4, 8, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x4c8__wasmsimd_dot16x2_ld128, 8, false, 4, 4, 8, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 +XNN_GEMM(xnn_arch_x86_avx, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x4c8__avx_ld64, 8, false, 1, 4, 8, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x4c8__avx_ld128, 8, false, 1, 4, 8, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x4c8__sse2_ld64, 8, false, 1, 4, 8, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x4c8__sse2_ld128, 8, false, 1, 4, 8, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x4c8__sse41_ld64, 8, false, 1, 4, 8, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) 
+XNN_GEMM(xnn_arch_x86_sse4_1, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x4c8__sse41_ld128, 8, false, 1, 4, 8, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_2x4c8__avx_ld64, 8, false, 2, 4, 8, 1, 2, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_2x4c8__avx_ld128, 8, false, 2, 4, 8, 1, 2, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_2x4c8__sse2_ld64, 8, false, 2, 4, 8, 1, 2, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_2x4c8__sse2_ld128, 8, false, 2, 4, 8, 1, 2, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_2x4c8__sse41_ld64, 8, false, 2, 4, 8, 1, 2, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_2x4c8__sse41_ld128, 8, false, 2, 4, 8, 1, 2, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_3x4c8__avx_ld64, 8, false, 3, 4, 8, 1, 3, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_3x4c8__avx_ld128, 8, false, 3, 4, 8, 1, 3, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_3x4c8__sse2_ld64, 8, false, 3, 4, 8, 1, 
3, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_3x4c8__sse2_ld128, 8, false, 3, 4, 8, 1, 3, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_3x4c8__sse41_ld64, 8, false, 3, 4, 8, 1, 3, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_3x4c8__sse41_ld128, 8, false, 3, 4, 8, 1, 3, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x4c8__avx_ld64, 8, false, 4, 4, 8, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x4c8__avx_ld128, 8, false, 4, 4, 8, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x4c8__sse2_ld64, 8, false, 4, 4, 8, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(0, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x4c8__sse2_ld128, 8, false, 4, 4, 8, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x4c8__sse41_ld64, 8, false, 4, 4, 8, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_sse4_1, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x4c8__sse41_ld128, 8, false, 4, 4, 8, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, 
xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x8c8__avx2, 8, false, 1, 8, 8, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_2x8c8__avx2, 8, false, 2, 8, 8, 1, 2, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_3x8c8__avx2, 8, false, 3, 8, 8, 1, 3, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx2, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x8c8__avx2, 8, false, 4, 8, 8, 1, 4, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + +#if XNN_ENABLE_AVX256SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x8c8__avx256skx, 8, false, 1, 8, 8, 1, 1, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_5x8c8__avx256skx, 8, false, 5, 8, 8, 1, 5, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_7x8c8__avx256skx, 8, false, 7, 8, 8, 1, 7, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +XNN_GEMM(xnn_arch_x86_avx256skx, xnn_qd8_f32_qc8w_igemm_minmax_ukernel_8x8c8__avx256skx, 8, false, 8, 8, 8, 1, 8, true, float, struct xnn_f32_default_params, xnn_init_f32_minmax_scalar_params, xnn_pack_qs8_conv_goki_w) +#endif // XNN_ENABLE_AVX256SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + diff --git a/tools/generate-gemm-header.py b/tools/generate-gemm-header.py 
new file mode 100644 index 00000000000..54652b3c156 --- /dev/null +++ b/tools/generate-gemm-header.py @@ -0,0 +1,344 @@ +import argparse +import codecs +import codecs +from collections import defaultdict +import io +import math +import os +import os +import re +import sys +import platform +from playsound import playsound + +import xnncommon +from xnncommon import _ARCH_TO_MACRO_MAP, _ISA_TO_MACRO_MAP +import yaml + +try: + _DATATYPE_TO_CTYPE_MAP = { + "s8": "int8_t", + "u8": "uint8_t", + "qs8": "int8_t", + "qu8": "uint8_t", + "s16": "int16_t", + "u16": "uint16_t", + "s32": "int32_t", + "u32": "uint32_t", + "s64": "int64_t", + "u64": "uint64_t", + "bf16": "xnn_bfloat16", + "f16": "xnn_float16", + "f32": "float", + } + + params_map = { + "clamp": "minmax", + "rndd": "rnd", + "rndne": "rnd", + "rndu": "rnd", + "rndz": "rnd", + "elu": "elu", + "lrelu": "lrelu", + "relu": "relu", + } + + yamls = { +# "bf16-gemm-minmax": "bf16-gemm", +# "f16-f32acc-gemm-minmax": "f16-f32acc-gemm", +# "f16-f32acc-igemm-minmax": "f16-f32acc-igemm", +# "f16-gemm-minmax": "f16-gemm", +# "f16-igemm-minmax": "f16-igemm", +# "f32-gemm-goi-minmax": "f32-gemm", +# "f32-gemm-minmax": "f32-gemm", +# "f32-gemm-relu": "f32-gemm", +# "f32-igemm-minmax": "f32-igemm", +# "f32-igemm-relu": "f32-igemm", +# "f32-qc4w-gemm-minmax": "f32-qc4w-gemm", +# "f32-qc8w-gemm-minmax": "f32-qc8w-gemm", +# "f32-qc8w-gemm-relu": "f32-qc8w-gemm", +# "f32-ppmm-minmax": "f32-ppmm", +# "qd8-f16-qb4w-gemm-minmax": "qd8-f16-qb4w-gemm", +# "qd8-f16-qc4w-gemm-minmax": "qd8-f16-qc4w-gemm", +# "qd8-f16-qc8w-gemm-minmax": "qd8-f16-qc8w-gemm", +# "qd8-f32-qb4w-gemm-minmax": "qd8-f32-qb4w-gemm", +# "qd8-f32-qc4w-gemm-minmax": "qd8-f32-qc4w-gemm", +# "qd8-f32-qc8w-gemm-minmax": "qd8-f32-qc8w-gemm", +# "qd8-f32-qc8w-igemm-minmax": "qd8-f32-qc8w-igemm", + "qs8-qc8w-gemm-minmax-fp32": "qs8-qc8w-gemm", + "qs8-qc8w-igemm-minmax-fp32": "qs8-qc8w-igemm", + "qu8-gemm-minmax-fp32": "qu8-gemm", + "qu8-gemm-minmax-rndnu": "qu8-gemm", + 
"qu8-igemm-minmax-fp32": "qu8-igemm", + "qu8-igemm-minmax-rndnu": "qu8-igemm", + "qp8-f32-qb4w-gemm-minmax": "qp8-f32-qb4w-gemm", + "qp8-f32-qc4w-gemm-minmax": "qp8-f32-qc4w-gemm", + } + + HEADER = """// Copyright 2023 Google LLC + // + // This source code is licensed under the BSD-style license found in the + // LICENSE file in the root directory of this source tree. + // Arguments are: + // XNN_GEMM(arch_flags, fn_name, k_block, is_pipelined, mr, nr, kr, sr, mr_packed, is_igemm, datatype, params_type, init_fn, pack_fn) + + """ + + def split_ukernel_name(name): + match = re.fullmatch( + r"xnn_((?:[a-z0-9]+_?)+)_(gemm|igemm|ppmm|goi)(_(minmax|relu|none)(_(fp32|rndnu|rndnu16|none))?)?_ukernel__(.+)", + name, + ) + + if match: + data_type = match.group(1) # Extract data type like 'qp8_f32_qb4w' + op = match.group(2) # Extract operation type + activation = match.group(3) or "" # Extract activation type (e.g., minmax, relu, etc.) + target_name = match.group(6) # Extract target name + + common_name = name.split("__")[0] + common_parts = common_name.split("_") + param_spec = common_parts[-1] + + if "s" in param_spec: + param_spec, sr = param_spec.split("s", 1) + sr = int(sr) + else: + sr = 1 + if "c" in param_spec: + param_spec, kr = param_spec.split("c", 1) + kr = int(kr) + else: + kr = 1 + if "v" in param_spec: + vector_tile = True + param_spec, _ = param_spec.split("v", 1) + else: + vector_tile = False + + mr, nr = map(int, param_spec.split("x")) + + arch, isa, assembly = xnncommon.parse_target_name(target_name) + + mr_packed = re.search(r"mstep([0-9]+)", target_name) + if mr_packed: + mr_packed = mr // int(mr_packed.group(1)) + else: + mr_packed = mr + + requantization = common_parts[-3] if len(common_parts) > 2 else None + if requantization not in ["fp32", "rndnu", "rndnu16", "none"]: + requantization = None + + print(f"Name: {name}, ISA: {isa}") + return data_type, mr, nr, kr, sr, mr_packed, vector_tile, requantization, op, activation, arch, isa, assembly + + # 
Fallback logic for unknown names + data_type = None + for key in _DATATYPE_TO_CTYPE_MAP.keys(): + if key in name: + data_type = key + break + + if not data_type: + data_type = "unknown" + + common_name, target_name = name.split("__", 1) + param_spec = common_name.split("_")[-1] + + if "s" in param_spec: + param_spec, sr = param_spec.split("s", 1) + sr = int(sr) + else: + sr = 1 + if "c" in param_spec: + param_spec, kr = param_spec.split("c", 1) + kr = int(kr) + else: + kr = 1 + if "v" in param_spec: + vector_tile = True + param_spec, _ = param_spec.split("v", 1) + else: + vector_tile = False + + mr, nr = map(int, param_spec.split("x")) + + arch, isa, assembly = xnncommon.parse_target_name(target_name) + + print(f"Name: {name}, ISA: {isa}") + return data_type, mr, nr, kr, sr, mr, vector_tile, "unknown", "", arch, isa, assembly + + isas = { + "v6": "xnn_arch_arm_v6", + "armsimd32": "xnn_arch_arm_v6", + "vfpv2": "xnn_arch_arm_vfpv2", + "vfpv3": "xnn_arch_arm_vfpv3", + "neon": "xnn_arch_arm_neon", + "neonfp16": "xnn_arch_arm_neon_fp16", + "neonfma": "xnn_arch_arm_neon_fma", + "neonv8": "xnn_arch_arm_neon_v8", + "fp16arith": "xnn_arch_arm_fp16_arith", + "neonfp16arith": "xnn_arch_arm_neon_fp16_arith", + "neondotfp16arith":"xnn_arch_arm_neon_dot_fp16_arith", + "neonbf16": "xnn_arch_arm_neon_bf16", + "neondot": "xnn_arch_arm_neon_dot", + "neon_i8mm": "xnn_arch_arm_neon_i8mm", + "neoni8mm": "xnn_arch_arm_neon_i8mm", + "sse": "0", + "sse2": "0", + "ssse3": "xnn_arch_x86_ssse3", + "sse41": "xnn_arch_x86_sse4_1", + "avx": "xnn_arch_x86_avx", + "f16c": "xnn_arch_x86_f16c", + "fma3": "xnn_arch_x86_fma3", + "avx2": "xnn_arch_x86_avx2", + "avx512f": "xnn_arch_x86_avx512f", + "avx512vbmi": "xnn_arch_x86_avx512vbmi", + "avx512skx": "xnn_arch_x86_avx512skx", + "avx512vnni": "xnn_arch_x86_avx512vnni", + "avx512vnnigfni": "xnn_arch_x86_avx512vnnigfni", + "avx512amx": "xnn_arch_x86_avx512amx", + "avx512fp16": "xnn_arch_x86_avx512fp16", + "avxvnni": "xnn_arch_x86_avxvnni", + 
+    "avxvnniint8": "xnn_arch_x86_avxvnniint8",
+    "avx256skx": "xnn_arch_x86_avx256skx",
+    "avx256vnni": "xnn_arch_x86_avx256vnni",
+    "avx256vnnigfni": "xnn_arch_x86_avx256vnnigfni",
+    "rvv": "xnn_arch_riscv_vector",
+    "rvvfp16arith": "xnn_arch_riscv_vector_fp16_arith",
+    "vlenb": "xnn_arch_riscv_vlenb",
+    # xnn_arch_vsx = 1 << 0,
+    # xnn_arch_vsx3 = 1 << 1,
+    # xnn_arch_mma = 1 << 2,
+    "is_x86": "xnn_arch_wasm_is_x86",
+    "wasmblendvps": "xnn_arch_wasm_blendvps",
+    "pshufb": "xnn_arch_wasm_pshufb",
+    "sdot": "xnn_arch_wasm_sdot",
+    "usdot": "xnn_arch_wasm_usdot",
+    "fma": "xnn_arch_wasm_fma",
+    "wasmpshufb": "xnn_arch_wasm_pshufb",
+    "wasmsdot": "xnn_arch_wasm_sdot",
+    "wasmusdot": "xnn_arch_wasm_usdot",
+    "wasmfma": "xnn_arch_wasm_fma",
+    "hvx": "xnn_arch_hvx",
+    "wasm": "0",
+    "wasmsimd": "0",
+    "wasm32": "0",
+    "wasmrelaxedsimd": "0",
+    None: "0",
+  }
+
+  # yamls maps a spec name to the source directory its header lives in;
+  # invert it so the specs can be walked directory by directory.
+  yamls_inverted = defaultdict(list)
+  for spec_name, directory in yamls.items():
+    yamls_inverted[directory].append(spec_name)
+
+  # Resolve inputs and outputs relative to the repository root (the parent of
+  # the directory holding this script) rather than one developer's checkout.
+  import os
+  repo_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+
+  files = []
+  hdrs = []
+  for directory, spec_names in yamls_inverted.items():
+    for spec_name in spec_names:
+      # Header to (over)write: src/DIRECTORY/SPEC.h
+      dst = os.path.join(repo_root, "src", directory, spec_name + ".h")
+      # Spec to read: test/SPEC.yaml
+      src = os.path.join(repo_root, "test", spec_name + ".yaml")
+
+      hdrs.append(dst)
+      files.append(spec_name)
+
+      with codecs.open(src, "r", encoding="utf-8") as spec_file:
+        spec_yaml = yaml.safe_load(spec_file)
+      if not isinstance(spec_yaml, list):
+        raise ValueError("expected a list of micro-kernels in the spec")
+
+      output = HEADER
+      # The "#if ..." line currently open in the output ("" when none is).
+      in_define = ""
+
+      for ukernel_spec in spec_yaml:
+        name = ukernel_spec["name"]
+        init_fn = ukernel_spec.get("init")
+        pack_fn = ukernel_spec.get("pack")
+        kblock = ukernel_spec.get("k-block")
+
+        (data_type, mr, nr, kr, sr, mr_packed, is_pipelined, op, activation,
+         arch, isa, assembly) = split_ukernel_name(name)
+        ctype = _DATATYPE_TO_CTYPE_MAP[data_type]
+
+        guard = _ISA_TO_MACRO_MAP.get(isa, "")
+        isa = isas[isa]
+        arch_macros = " || ".join(_ARCH_TO_MACRO_MAP[a] for a in arch)
+
+        # Preprocessor condition under which this kernel is available.
+        if arch_macros and guard != "":
+          define = "#if " + guard + " && (" + arch_macros + ")\n"
+        elif arch_macros:
+          define = "#if " + arch_macros + "\n"
+        elif guard != "":
+          define = "#if " + guard + "\n"
+        else:
+          define = ""
+
+        # Close the previous #if block and open a new one whenever the
+        # condition changes between consecutive kernels.
+        if in_define != define:
+          if in_define != "":
+            # in_define[4:] drops the leading "#if " and keeps the newline.
+            output += "#endif // " + in_define[4:]
+          output += "\n"
+          output += define
+          in_define = define
+
+        if op in params_map:
+          params_kind = params_map[op]
+        elif activation and "minmax" in activation:
+          params_kind = "minmax"
+        elif activation and "relu" in activation:
+          params_kind = "relu"
+        else:
+          params_kind = "default"
+        params_type = "struct xnn_" + data_type + "_" + params_kind + "_params"
+
+        if not init_fn:
+          # No init function in the spec: emit a typed NULL placeholder.
+          init_fn = "((" + params_type.replace("xnn_", "xnn_init_") + "_fn) NULL)"
+        if data_type.startswith("q") and "hswish" not in op:
+          params_type = "struct xnn_" + data_type + "_" + op.replace("c", "") + "_minmax_params"
+        params_type = params_type.replace("xnn_u64_default_params", "uint32_t")
+        init_fn = init_fn.replace("((xnn_init_u64_default_params_fn) NULL)", "NULL")
+
+        is_igemm = "igemm" in name
+        # NOTE(review): this compares against the string "True"; if
+        # split_ukernel_name returns a bool in this slot the comparison never
+        # holds and every kernel is emitted as non-pipelined -- confirm.
+        pipelined = "true" if is_pipelined == "True" else "false"
+
+        fields = (
+            isa,
+            name,
+            kblock,
+            pipelined,
+            mr,
+            nr,
+            kr,
+            sr,
+            mr_packed,
+            "true" if is_igemm else "false",
+            ctype,
+            params_type,
+            init_fn,
+            pack_fn,
+        )
+        output += "XNN_GEMM(" + ", ".join(map(str, fields)) + ")\n"
+
+      if in_define != "":
+        output += "#endif // " + in_define[4:] + "\n"
+
+      with codecs.open(dst, "w", encoding="utf-8") as output_file:
+        output_file.write(output)
+
+
+  print("MICROKERNEL_DEPS = [")
+  print(",\n".join([' "' + hdr + '"' for hdr in hdrs]))
+  print("]")
+  print(" ".join(files))
+
+except Exception as e:
+  print("An error occurred:", e)
+
+finally:
+  # NOTE(review): plays a sound file from a developer-local path when the
+  # script finishes -- confirm this is still wanted.
+  playsound("/home/mcw/Downloads/sound.mp3")
diff --git a/tools/generate-gemm-test.py b/tools/generate-gemm-test.py index cad6738f005..10a7a54a40a 100755 --- a/tools/generate-gemm-test.py +++ b/tools/generate-gemm-test.py @@ -686,15 +686,6 @@ def main(args): else: requantization =
None - # if "__" in ukernel: - # common_name, target_name = ukernel.split("__", 1) - # else: - # common_name = ukernel - # target_name = "" - - # common_parts = common_name.split("_") - # param_spec = common_parts[-1] - nr_scale = "" # if "v" in param_spec: # ctype = {