noah-memset.patch
From 5bf0ab80573d66e4ae5d94b094659094336da90f Mon Sep 17 00:00:00 2001
From: Noah Goldstein <[email protected]>
Date: Fri, 24 May 2024 12:38:50 -0500
Subject: [PATCH] x86: Improve large memset perf with non-temporal stores
[RHEL-29312]

Previously we used `rep stosb` for all medium/large memsets.  This is
notably worse than non-temporal stores for large (above a few MBs)
memsets.  See:
https://docs.google.com/spreadsheets/d/1opzukzvum4n6-RUVHTGddV6RjAEil4P2uMjjQGLbLcU/edit?usp=sharing
for data comparing different strategies for large memset on ICX and SKX.

Using non-temporal stores can be up to 3x faster on ICX and 2x faster
on SKX.  Historically, these numbers would not have been so good
because of the zero-over-zero writeback optimization that `rep stosb`
is able to do, but that optimization has since been removed as a
potential side-channel attack, so there is no longer any good reason
to rely solely on `rep stosb` for large memsets.  On the flip side,
non-temporal writes avoid the data fetch of an RFO request, saving
memory bandwidth.
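
As a rough sketch of the strategy (not the actual glibc assembly), the
dispatch could look like the C-with-AVX2-intrinsics code below.  The
threshold constants and the helper name memset_sketch are made up for
illustration; glibc uses the per-CPU tunables __x86_rep_stosb_threshold
and __x86_shared_non_temporal_threshold, and the real tail handling in
the assembly differs slightly.

    /* Compile with -mavx2.  Illustrative only.  */
    #include <immintrin.h>
    #include <stddef.h>
    #include <stdint.h>

    #define REP_STOSB_THRESHOLD 2048          /* illustrative */
    #define NT_THRESHOLD        (4UL << 20)   /* illustrative */

    static void *
    memset_sketch (void *dst, int c, size_t n)
    {
      unsigned char *d = dst;

      if (n >= NT_THRESHOLD)
        {
          __m256i v = _mm256_set1_epi8 ((char) c);
          unsigned char *end = d + n;
          /* Head: one unaligned store, then round d up to 32-byte
             alignment.  */
          _mm256_storeu_si256 ((__m256i *) d, v);
          d = (unsigned char *) (((uintptr_t) d + 32) & ~(uintptr_t) 31);
          /* Body: non-temporal stores write around the cache, so no RFO
             reads are issued.  */
          while (end - d >= 32)
            {
              _mm256_stream_si256 ((__m256i *) d, v);
              d += 32;
            }
          _mm_sfence ();
          /* Tail: an overlapping unaligned store covers the last <32
             bytes.  */
          _mm256_storeu_si256 ((__m256i *) (end - 32), v);
          return dst;
        }

      if (n >= REP_STOSB_THRESHOLD)
        {
          void *di = d;
          size_t cnt = n;
          __asm__ volatile ("rep stosb"
                            : "+D" (di), "+c" (cnt)
                            : "a" (c)
                            : "memory");
          return dst;
        }

      /* Small sizes: glibc uses vector/integer stores; a byte loop
         stands in here.  */
      for (size_t i = 0; i < n; i++)
        d[i] = (unsigned char) c;
      return dst;
    }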

All of the other changes to the file just reorganize the code blocks
to maintain "good" alignment given the new code added in the
`L(stosb_local)` case.

The results from running the GLIBC memset benchmarks on TGL-client for
N=20 runs:

Geometric Mean across the suite New / Old EVEX256: 0.979
Geometric Mean across the suite New / Old EVEX512: 0.979
Geometric Mean across the suite New / Old AVX2   : 0.986
Geometric Mean across the suite New / Old SSE2   : 0.979

Most of the cases are essentially unchanged; this is mostly to show
that adding the non-temporal case didn't add any regressions to the
other cases.

The results on the memset-large benchmark suite on TGL-client for N=20
runs:

Geometric Mean across the suite New / Old EVEX256: 0.926
Geometric Mean across the suite New / Old EVEX512: 0.925
Geometric Mean across the suite New / Old AVX2   : 0.928
Geometric Mean across the suite New / Old SSE2   : 0.924

So, roughly a 7.5% speedup.  This is lower than what we see on servers
(likely because clients typically have higher single-core memory
bandwidth, so saving bandwidth on RFOs is less impactful), but it is
still advantageous.
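
For reference, the "New / Old" numbers above are geometric means of
the per-benchmark time ratios, so a value of ~0.925 means the new code
takes ~92.5% of the old time, i.e. roughly 7.5% less time.  A tiny
standalone example with made-up ratios (this message only reports the
resulting geometric means):

    /* Compile with: cc geomean.c -lm  (file name is arbitrary).  */
    #include <math.h>
    #include <stddef.h>
    #include <stdio.h>

    int
    main (void)
    {
      /* Made-up per-benchmark New/Old time ratios.  */
      double ratios[] = { 0.91, 0.95, 0.93, 0.92 };
      size_t n = sizeof ratios / sizeof ratios[0];

      double log_sum = 0.0;
      for (size_t i = 0; i < n; i++)
        log_sum += log (ratios[i]);
      double geomean = exp (log_sum / n);

      /* Prints ~0.927, i.e. about 7.3% less time for these inputs.  */
      printf ("geomean = %.3f (%.1f%% less time)\n",
              geomean, (1.0 - geomean) * 100.0);
      return 0;
    }
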
Full test-suite passes on x86_64 w/ and w/o multiarch.
Reviewed-by: H.J. Lu <[email protected]>
---
.../multiarch/memset-vec-unaligned-erms.S | 149 +++++++++++-------
1 file changed, 91 insertions(+), 58 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 97839a2248..637caadb40 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -21,10 +21,13 @@
2. If size is less than VEC, use integer register stores.
3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
- 5. On machines ERMS feature, if size is greater or equal than
- __x86_rep_stosb_threshold then REP STOSB will be used.
- 6. If size is more to 4 * VEC_SIZE, align to 4 * VEC_SIZE with
- 4 VEC stores and store 4 * VEC at a time until done. */
+ 5. If size is more to 4 * VEC_SIZE, align to 1 * VEC_SIZE with
+ 4 VEC stores and store 4 * VEC at a time until done.
+ 6. On machines ERMS feature, if size is range
+ [__x86_rep_stosb_threshold, __x86_shared_non_temporal_threshold)
+ then REP STOSB will be used.
+ 7. If size >= __x86_shared_non_temporal_threshold, use a
+ non-temporal stores. */
#include <sysdep.h>
@@ -147,6 +150,41 @@ L(entry_from_wmemset):
VMOVU %VMM(0), -VEC_SIZE(%rdi,%rdx)
VMOVU %VMM(0), (%rdi)
VZEROUPPER_RETURN
+
+ /* If have AVX512 mask instructions put L(less_vec) close to
+ entry as it doesn't take much space and is likely a hot target. */
+#ifdef USE_LESS_VEC_MASK_STORE
+ /* Align to ensure the L(less_vec) logic all fits in 1x cache lines. */
+ .p2align 6,, 47
+ .p2align 4
+L(less_vec):
+L(less_vec_from_wmemset):
+ /* Less than 1 VEC. */
+# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
+# error Unsupported VEC_SIZE!
+# endif
+ /* Clear high bits from edi. Only keeping bits relevant to page
+ cross check. Note that we are using rax which is set in
+ MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out. */
+ andl $(PAGE_SIZE - 1), %edi
+ /* Check if VEC_SIZE store cross page. Mask stores suffer
+ serious performance degradation when it has to fault suppress. */
+ cmpl $(PAGE_SIZE - VEC_SIZE), %edi
+ /* This is generally considered a cold target. */
+ ja L(cross_page)
+# if VEC_SIZE > 32
+ movq $-1, %rcx
+ bzhiq %rdx, %rcx, %rcx
+ kmovq %rcx, %k1
+# else
+ movl $-1, %ecx
+ bzhil %edx, %ecx, %ecx
+ kmovd %ecx, %k1
+# endif
+ vmovdqu8 %VMM(0), (%rax){%k1}
+ VZEROUPPER_RETURN
+#endif
+
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMSET_SYMBOL (__memset, unaligned))
@@ -185,54 +223,6 @@ L(last_2x_vec):
#endif
VZEROUPPER_RETURN
- /* If have AVX512 mask instructions put L(less_vec) close to
- entry as it doesn't take much space and is likely a hot target.
- */
-#ifdef USE_LESS_VEC_MASK_STORE
- .p2align 4,, 10
-L(less_vec):
-L(less_vec_from_wmemset):
- /* Less than 1 VEC. */
-# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
-# error Unsupported VEC_SIZE!
-# endif
- /* Clear high bits from edi. Only keeping bits relevant to page
- cross check. Note that we are using rax which is set in
- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out. */
- andl $(PAGE_SIZE - 1), %edi
- /* Check if VEC_SIZE store cross page. Mask stores suffer
- serious performance degradation when it has to fault suppress.
- */
- cmpl $(PAGE_SIZE - VEC_SIZE), %edi
- /* This is generally considered a cold target. */
- ja L(cross_page)
-# if VEC_SIZE > 32
- movq $-1, %rcx
- bzhiq %rdx, %rcx, %rcx
- kmovq %rcx, %k1
-# else
- movl $-1, %ecx
- bzhil %edx, %ecx, %ecx
- kmovd %ecx, %k1
-# endif
- vmovdqu8 %VMM(0), (%rax){%k1}
- VZEROUPPER_RETURN
-
-# if defined USE_MULTIARCH && IS_IN (libc)
- /* Include L(stosb_local) here if including L(less_vec) between
- L(stosb_more_2x_vec) and ENTRY. This is to cache align the
- L(stosb_more_2x_vec) target. */
- .p2align 4,, 10
-L(stosb_local):
- movzbl %sil, %eax
- mov %RDX_LP, %RCX_LP
- mov %RDI_LP, %RDX_LP
- rep stosb
- mov %RDX_LP, %RAX_LP
- VZEROUPPER_RETURN
-# endif
-#endif
-
#if defined USE_MULTIARCH && IS_IN (libc)
.p2align 4
L(stosb_more_2x_vec):
@@ -318,21 +308,33 @@ L(return_vzeroupper):
ret
#endif
- .p2align 4,, 10
-#ifndef USE_LESS_VEC_MASK_STORE
-# if defined USE_MULTIARCH && IS_IN (libc)
+#ifdef USE_WITH_AVX2
+ .p2align 4
+#else
+ .p2align 4,, 4
+#endif
+
+#if defined USE_MULTIARCH && IS_IN (libc)
/* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in
range for 2-byte jump encoding. */
L(stosb_local):
+ cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
+ jae L(nt_memset)
movzbl %sil, %eax
mov %RDX_LP, %RCX_LP
mov %RDI_LP, %RDX_LP
rep stosb
+# if (defined USE_WITH_SSE2) || (defined USE_WITH_AVX512)
+ /* Use xchg to save 1-byte (this helps align targets below). */
+ xchg %RDX_LP, %RAX_LP
+# else
mov %RDX_LP, %RAX_LP
- VZEROUPPER_RETURN
# endif
+ VZEROUPPER_RETURN
+#endif
+#ifndef USE_LESS_VEC_MASK_STORE
/* Define L(less_vec) only if not otherwise defined. */
- .p2align 4
+ .p2align 4,, 12
L(less_vec):
/* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to
xmm). This is only does anything for AVX2. */
@@ -423,4 +425,35 @@ L(between_2_3):
movb %SET_REG8, -1(%LESS_VEC_REG, %rdx)
#endif
ret
-END (MEMSET_SYMBOL (__memset, unaligned_erms))
+
+#if defined USE_MULTIARCH && IS_IN (libc)
+# ifdef USE_WITH_AVX512
+ /* Force align so the loop doesn't cross a cache-line. */
+ .p2align 4
+# endif
+ .p2align 4,, 7
+ /* Memset using non-temporal stores. */
+L(nt_memset):
+ VMOVU %VMM(0), (VEC_SIZE * 0)(%rdi)
+ leaq (VEC_SIZE * -4)(%rdi, %rdx), %rdx
+ /* Align DST. */
+ orq $(VEC_SIZE * 1 - 1), %rdi
+ incq %rdi
+ .p2align 4,, 7
+L(nt_loop):
+ VMOVNT %VMM(0), (VEC_SIZE * 0)(%rdi)
+ VMOVNT %VMM(0), (VEC_SIZE * 1)(%rdi)
+ VMOVNT %VMM(0), (VEC_SIZE * 2)(%rdi)
+ VMOVNT %VMM(0), (VEC_SIZE * 3)(%rdi)
+ subq $(VEC_SIZE * -4), %rdi
+ cmpq %rdx, %rdi
+ jb L(nt_loop)
+ sfence
+ VMOVU %VMM(0), (VEC_SIZE * 0)(%rdx)
+ VMOVU %VMM(0), (VEC_SIZE * 1)(%rdx)
+ VMOVU %VMM(0), (VEC_SIZE * 2)(%rdx)
+ VMOVU %VMM(0), (VEC_SIZE * 3)(%rdx)
+ VZEROUPPER_RETURN
+#endif
+
+END(MEMSET_SYMBOL(__memset, unaligned_erms))
--
2.43.0