mirror of git://sourceware.org/git/glibc.git
x86: Improve large memset perf with non-temporal stores [RHEL-29312]
Previously we used `rep stosb` for all medium/large memsets. This is notably worse than non-temporal stores for large (above a few MBs) memsets. See: https://docs.google.com/spreadsheets/d/1opzukzvum4n6-RUVHTGddV6RjAEil4P2uMjjQGLbLcU/edit?usp=sharing for data comparing different strategies for large memset on ICX and SKX.

Using non-temporal stores can be up to 3x faster on ICX and 2x faster on SKX. Historically, these numbers would not have been so good because of the zero-over-zero writeback optimization that `rep stosb` is able to do, but that optimization has been removed as a potential side-channel attack, so there is no longer any good reason to rely only on `rep stosb` for large memsets. On the flip side, non-temporal writes can avoid the data transfer of their RFO requests, saving memory bandwidth.

All of the other changes to the file re-organize the code blocks to maintain "good" alignment given the new code added in the `L(stosb_local)` case.

The results from running the GLIBC memset benchmarks on TGL-client for N=20 runs:

Geometric Mean across the suite New / Old EVEX256: 0.979
Geometric Mean across the suite New / Old EVEX512: 0.979
Geometric Mean across the suite New / Old AVX2   : 0.986
Geometric Mean across the suite New / Old SSE2   : 0.979

Most of the cases are essentially unchanged; this mostly shows that adding the non-temporal case didn't introduce regressions in the other cases.

The results on the memset-large benchmark suite on TGL-client for N=20 runs:

Geometric Mean across the suite New / Old EVEX256: 0.926
Geometric Mean across the suite New / Old EVEX512: 0.925
Geometric Mean across the suite New / Old AVX2   : 0.928
Geometric Mean across the suite New / Old SSE2   : 0.924

So roughly a 7.5% speedup. This is lower than what we see on servers (likely because clients typically have faster single-core bandwidth, so saving bandwidth on RFOs is less impactful), but still advantageous.

Full test-suite passes on x86_64 w/ and w/o multiarch.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
parent 53f9d74322
commit 5bf0ab8057
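For orientation before the diff, here is a minimal C sketch of the size-based dispatch this change produces. The two threshold symbols mirror the glibc internals named in the diff; the memset_* helpers are hypothetical stand-ins for the assembly paths, not real glibc functions.

/* Sketch only: the helper functions are placeholders for the asm paths.  */
#include <stddef.h>

extern size_t __x86_rep_stosb_threshold;            /* existing tunable */
extern size_t __x86_shared_non_temporal_threshold;  /* existing tunable */

extern void *memset_vec_loop (void *dst, int c, size_t n);     /* 4x VEC store loop */
extern void *memset_rep_stosb (void *dst, int c, size_t n);    /* ERMS rep stosb */
extern void *memset_nontemporal (void *dst, int c, size_t n);  /* new VMOVNT loop */

static void *
memset_dispatch (void *dst, int c, size_t n)
{
  if (n < __x86_rep_stosb_threshold)
    return memset_vec_loop (dst, c, n);        /* small/medium sizes */
  if (n < __x86_shared_non_temporal_threshold)
    return memset_rep_stosb (dst, c, n);       /* medium/large sizes */
  return memset_nontemporal (dst, c, n);       /* very large: new in this commit */
}

Only the last branch is new; sizes below __x86_shared_non_temporal_threshold are handled exactly as before.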
@@ -21,10 +21,13 @@
    2. If size is less than VEC, use integer register stores.
    3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
    4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
-   5. On machines ERMS feature, if size is greater or equal than
-      __x86_rep_stosb_threshold then REP STOSB will be used.
-   6. If size is more to 4 * VEC_SIZE, align to 4 * VEC_SIZE with
-      4 VEC stores and store 4 * VEC at a time until done.  */
+   5. If size is more to 4 * VEC_SIZE, align to 1 * VEC_SIZE with
+      4 VEC stores and store 4 * VEC at a time until done.
+   6. On machines ERMS feature, if size is range
+      [__x86_rep_stosb_threshold, __x86_shared_non_temporal_threshold)
+      then REP STOSB will be used.
+   7. If size >= __x86_shared_non_temporal_threshold, use a
+      non-temporal stores.  */

 #include <sysdep.h>

@@ -147,6 +150,41 @@ L(entry_from_wmemset):
 	VMOVU	%VMM(0), -VEC_SIZE(%rdi,%rdx)
 	VMOVU	%VMM(0), (%rdi)
 	VZEROUPPER_RETURN

+	/* If have AVX512 mask instructions put L(less_vec) close to
+	   entry as it doesn't take much space and is likely a hot target.  */
+#ifdef USE_LESS_VEC_MASK_STORE
+	/* Align to ensure the L(less_vec) logic all fits in 1x cache lines.  */
+	.p2align 6,, 47
+	.p2align 4
+L(less_vec):
+L(less_vec_from_wmemset):
+	/* Less than 1 VEC.  */
+# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
+#  error Unsupported VEC_SIZE!
+# endif
+	/* Clear high bits from edi. Only keeping bits relevant to page
+	   cross check. Note that we are using rax which is set in
+	   MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out.  */
+	andl	$(PAGE_SIZE - 1), %edi
+	/* Check if VEC_SIZE store cross page. Mask stores suffer
+	   serious performance degradation when it has to fault suppress.  */
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %edi
+	/* This is generally considered a cold target.  */
+	ja	L(cross_page)
+# if VEC_SIZE > 32
+	movq	$-1, %rcx
+	bzhiq	%rdx, %rcx, %rcx
+	kmovq	%rcx, %k1
+# else
+	movl	$-1, %ecx
+	bzhil	%edx, %ecx, %ecx
+	kmovd	%ecx, %k1
+# endif
+	vmovdqu8 %VMM(0), (%rax){%k1}
+	VZEROUPPER_RETURN
+#endif
+
 #if defined USE_MULTIARCH && IS_IN (libc)
 END (MEMSET_SYMBOL (__memset, unaligned))

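The block added above is the existing L(less_vec) masked-store path, only relocated for alignment. As a rough illustration of what it computes, here is a hedged AVX-512 intrinsics sketch (hypothetical function name; assumes AVX512BW + BMI2 and that the caller has already excluded the page-cross case, which the real code branches away to L(cross_page)):

#include <immintrin.h>
#include <stddef.h>

/* Set len (< 64) bytes at dst to c with a single masked store.  */
static void
memset_less_vec_sketch (void *dst, int c, size_t len)
{
  __m512i v = _mm512_set1_epi8 ((char) c);
  /* Mask with the low `len` bits set, mirroring the bzhi in the asm.  */
  __mmask64 k = (__mmask64) _bzhi_u64 (~0ULL, (unsigned int) len);
  _mm512_mask_storeu_epi8 (dst, k, v);   /* bytes beyond len are untouched */
}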
@@ -185,54 +223,6 @@ L(last_2x_vec):
 #endif
 	VZEROUPPER_RETURN

-	/* If have AVX512 mask instructions put L(less_vec) close to
-	   entry as it doesn't take much space and is likely a hot target.
-	 */
-#ifdef USE_LESS_VEC_MASK_STORE
-	.p2align 4,, 10
-L(less_vec):
-L(less_vec_from_wmemset):
-	/* Less than 1 VEC.  */
-# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
-#  error Unsupported VEC_SIZE!
-# endif
-	/* Clear high bits from edi. Only keeping bits relevant to page
-	   cross check. Note that we are using rax which is set in
-	   MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out.  */
-	andl	$(PAGE_SIZE - 1), %edi
-	/* Check if VEC_SIZE store cross page. Mask stores suffer
-	   serious performance degradation when it has to fault suppress.
-	 */
-	cmpl	$(PAGE_SIZE - VEC_SIZE), %edi
-	/* This is generally considered a cold target.  */
-	ja	L(cross_page)
-# if VEC_SIZE > 32
-	movq	$-1, %rcx
-	bzhiq	%rdx, %rcx, %rcx
-	kmovq	%rcx, %k1
-# else
-	movl	$-1, %ecx
-	bzhil	%edx, %ecx, %ecx
-	kmovd	%ecx, %k1
-# endif
-	vmovdqu8 %VMM(0), (%rax){%k1}
-	VZEROUPPER_RETURN
-
-# if defined USE_MULTIARCH && IS_IN (libc)
-	/* Include L(stosb_local) here if including L(less_vec) between
-	   L(stosb_more_2x_vec) and ENTRY. This is to cache align the
-	   L(stosb_more_2x_vec) target.  */
-	.p2align 4,, 10
-L(stosb_local):
-	movzbl	%sil, %eax
-	mov	%RDX_LP, %RCX_LP
-	mov	%RDI_LP, %RDX_LP
-	rep	stosb
-	mov	%RDX_LP, %RAX_LP
-	VZEROUPPER_RETURN
-# endif
-#endif
-
 #if defined USE_MULTIARCH && IS_IN (libc)
 	.p2align 4
 L(stosb_more_2x_vec):

@@ -318,21 +308,33 @@ L(return_vzeroupper):
 	ret
 #endif

-	.p2align 4,, 10
-#ifndef USE_LESS_VEC_MASK_STORE
-# if defined USE_MULTIARCH && IS_IN (libc)
+#ifdef USE_WITH_AVX2
+	.p2align 4
+#else
+	.p2align 4,, 4
+#endif
+
+#if defined USE_MULTIARCH && IS_IN (libc)
 	/* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in
 	   range for 2-byte jump encoding.  */
 L(stosb_local):
+	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
+	jae	L(nt_memset)
 	movzbl	%sil, %eax
 	mov	%RDX_LP, %RCX_LP
 	mov	%RDI_LP, %RDX_LP
 	rep	stosb
+# if (defined USE_WITH_SSE2) || (defined USE_WITH_AVX512)
+	/* Use xchg to save 1-byte (this helps align targets below).  */
+	xchg	%RDX_LP, %RAX_LP
+# else
 	mov	%RDX_LP, %RAX_LP
-	VZEROUPPER_RETURN
 # endif
+	VZEROUPPER_RETURN
+#endif
+#ifndef USE_LESS_VEC_MASK_STORE
 	/* Define L(less_vec) only if not otherwise defined.  */
-	.p2align 4
+	.p2align 4,, 12
 L(less_vec):
 	/* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to
 	   xmm). This is only does anything for AVX2.  */

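In C terms, the new L(stosb_local) above behaves roughly like the sketch below. Here nt_threshold stands in for __x86_shared_non_temporal_threshold, memset_nt for the L(nt_memset) path, and the inline asm mirrors the movzbl / rep stosb sequence; these names are placeholders, not glibc symbols.

#include <stddef.h>

extern size_t nt_threshold;                           /* placeholder for the tunable */
extern void *memset_nt (void *dst, int c, size_t n);  /* placeholder for L(nt_memset) */

static void *
memset_stosb_local_sketch (void *dst, int c, size_t n)
{
  if (n >= nt_threshold)
    return memset_nt (dst, c, n);   /* new: very large sizes bypass rep stosb */

  void *d = dst;
  size_t count = n;
  /* rep stosb: store AL to [RDI], RCX times; RDI/RCX are updated in place.  */
  __asm__ volatile ("rep stosb"
                    : "+D" (d), "+c" (count)
                    : "a" ((unsigned char) c)
                    : "memory");
  return dst;                       /* memset returns the original pointer */
}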
@@ -423,4 +425,35 @@ L(between_2_3):
 	movb	%SET_REG8, -1(%LESS_VEC_REG, %rdx)
 #endif
 	ret
-END (MEMSET_SYMBOL (__memset, unaligned_erms))
+
+#if defined USE_MULTIARCH && IS_IN (libc)
+# ifdef USE_WITH_AVX512
+	/* Force align so the loop doesn't cross a cache-line.  */
+	.p2align 4
+# endif
+	.p2align 4,, 7
+	/* Memset using non-temporal stores.  */
+L(nt_memset):
+	VMOVU	%VMM(0), (VEC_SIZE * 0)(%rdi)
+	leaq	(VEC_SIZE * -4)(%rdi, %rdx), %rdx
+	/* Align DST.  */
+	orq	$(VEC_SIZE * 1 - 1), %rdi
+	incq	%rdi
+	.p2align 4,, 7
+L(nt_loop):
+	VMOVNT	%VMM(0), (VEC_SIZE * 0)(%rdi)
+	VMOVNT	%VMM(0), (VEC_SIZE * 1)(%rdi)
+	VMOVNT	%VMM(0), (VEC_SIZE * 2)(%rdi)
+	VMOVNT	%VMM(0), (VEC_SIZE * 3)(%rdi)
+	subq	$(VEC_SIZE * -4), %rdi
+	cmpq	%rdx, %rdi
+	jb	L(nt_loop)
+	sfence
+	VMOVU	%VMM(0), (VEC_SIZE * 0)(%rdx)
+	VMOVU	%VMM(0), (VEC_SIZE * 1)(%rdx)
+	VMOVU	%VMM(0), (VEC_SIZE * 2)(%rdx)
+	VMOVU	%VMM(0), (VEC_SIZE * 3)(%rdx)
+	VZEROUPPER_RETURN
+#endif
+
+END(MEMSET_SYMBOL(__memset, unaligned_erms))
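The L(nt_memset) code added above reads, in C with 32-byte vectors, roughly as follows. This is a sketch under stated assumptions: AVX2 intrinsics stand in for the size-generic VMOVNT/VMOVU macros, the function name is hypothetical, and len is assumed to be at least four vectors (the path is only reached far above that).

#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

static void *
memset_nt_sketch (void *dst, int c, size_t len)
{
  __m256i v = _mm256_set1_epi8 ((char) c);
  char *p = (char *) dst;
  char *tail = p + len - 4 * 32;            /* start of the last 4 vectors */

  _mm256_storeu_si256 ((__m256i *) p, v);   /* unaligned head store */
  /* Round p up to a 32-byte boundary (the orq/incq trick in the asm).  */
  p = (char *) (((uintptr_t) p | 31) + 1);

  while (p < tail)                          /* aligned streaming stores */
    {
      _mm256_stream_si256 ((__m256i *) (p + 0 * 32), v);
      _mm256_stream_si256 ((__m256i *) (p + 1 * 32), v);
      _mm256_stream_si256 ((__m256i *) (p + 2 * 32), v);
      _mm256_stream_si256 ((__m256i *) (p + 3 * 32), v);
      p += 4 * 32;
    }
  _mm_sfence ();                            /* order the weakly-ordered NT stores */

  /* Unaligned tail stores; they may overlap the loop's last iteration.  */
  _mm256_storeu_si256 ((__m256i *) (tail + 0 * 32), v);
  _mm256_storeu_si256 ((__m256i *) (tail + 1 * 32), v);
  _mm256_storeu_si256 ((__m256i *) (tail + 2 * 32), v);
  _mm256_storeu_si256 ((__m256i *) (tail + 3 * 32), v);
  return dst;
}

The head store plus the four tail stores cover anything the aligned streaming loop skips, so no scalar cleanup is needed.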