mirror of git://sourceware.org/git/glibc.git
x86: Improve large memset perf with non-temporal stores [RHEL-29312]
Previously we used `rep stosb` for all medium/large memsets. This is notably worse than non-temporal stores for large (above a few MBs) memsets. See: https://docs.google.com/spreadsheets/d/1opzukzvum4n6-RUVHTGddV6RjAEil4P2uMjjQGLbLcU/edit?usp=sharing for data comparing different strategies for large memset on ICX and SKX.

Using non-temporal stores can be up to 3x faster on ICX and 2x faster on SKX. Historically, these numbers would not have been so good because of the zero-over-zero writeback optimization that `rep stosb` is able to do, but that optimization has been removed as a potential side-channel attack, so there is no longer any good reason to rely only on `rep stosb` for large memsets. On the flip side, non-temporal writes can avoid the data transfer of their RFO requests, saving memory bandwidth.

All of the other changes to the file re-organize the code blocks to maintain "good" alignment given the new code added in the `L(stosb_local)` case.

The results from running the GLIBC memset benchmarks on TGL-client for N=20 runs:

Geometric Mean across the suite New / Old EVEX256: 0.979
Geometric Mean across the suite New / Old EVEX512: 0.979
Geometric Mean across the suite New / Old AVX2   : 0.986
Geometric Mean across the suite New / Old SSE2   : 0.979

Most of the cases are essentially unchanged; this mostly shows that adding the non-temporal case didn't introduce regressions in the other cases.

The results on the memset-large benchmark suite on TGL-client for N=20 runs:

Geometric Mean across the suite New / Old EVEX256: 0.926
Geometric Mean across the suite New / Old EVEX512: 0.925
Geometric Mean across the suite New / Old AVX2   : 0.928
Geometric Mean across the suite New / Old SSE2   : 0.924

So roughly a 7.5% speedup. This is lower than what we see on servers (likely because clients typically have faster single-core bandwidth, so saving bandwidth on RFOs is less impactful), but still advantageous.

Full test-suite passes on x86_64 w/ and w/o multiarch.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
parent 53f9d74322
commit 5bf0ab8057
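For orientation before the diff, here is a minimal C sketch of the size-based dispatch this change produces. The two threshold symbols mirror the glibc internals named in the diff; the memset_* helpers are hypothetical stand-ins for the assembly paths, not real glibc functions.

/* Sketch only: the helper functions are placeholders for the asm paths.  */
#include <stddef.h>

extern size_t __x86_rep_stosb_threshold;            /* existing tunable */
extern size_t __x86_shared_non_temporal_threshold;  /* existing tunable */

extern void *memset_vec_loop (void *dst, int c, size_t n);     /* 4x VEC store loop */
extern void *memset_rep_stosb (void *dst, int c, size_t n);    /* ERMS rep stosb */
extern void *memset_nontemporal (void *dst, int c, size_t n);  /* new VMOVNT loop */

static void *
memset_dispatch (void *dst, int c, size_t n)
{
  if (n < __x86_rep_stosb_threshold)
    return memset_vec_loop (dst, c, n);        /* small/medium sizes */
  if (n < __x86_shared_non_temporal_threshold)
    return memset_rep_stosb (dst, c, n);       /* medium/large sizes */
  return memset_nontemporal (dst, c, n);       /* very large: new in this commit */
}

Only the last branch is new; sizes below __x86_shared_non_temporal_threshold are handled exactly as before.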
@@ -21,10 +21,13 @@
    2. If size is less than VEC, use integer register stores.
    3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
    4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
-   5. On machines ERMS feature, if size is greater or equal than
-      __x86_rep_stosb_threshold then REP STOSB will be used.
-   6. If size is more to 4 * VEC_SIZE, align to 4 * VEC_SIZE with
-      4 VEC stores and store 4 * VEC at a time until done.  */
+   5. If size is more to 4 * VEC_SIZE, align to 1 * VEC_SIZE with
+      4 VEC stores and store 4 * VEC at a time until done.
+   6. On machines ERMS feature, if size is range
+      [__x86_rep_stosb_threshold, __x86_shared_non_temporal_threshold)
+      then REP STOSB will be used.
+   7. If size >= __x86_shared_non_temporal_threshold, use a
+      non-temporal stores.  */

 #include <sysdep.h>

@@ -147,6 +150,41 @@ L(entry_from_wmemset):
 	VMOVU	%VMM(0), -VEC_SIZE(%rdi,%rdx)
 	VMOVU	%VMM(0), (%rdi)
 	VZEROUPPER_RETURN

+	/* If have AVX512 mask instructions put L(less_vec) close to
+	   entry as it doesn't take much space and is likely a hot target.  */
+#ifdef USE_LESS_VEC_MASK_STORE
+	/* Align to ensure the L(less_vec) logic all fits in 1x cache lines.  */
+	.p2align 6,, 47
+	.p2align 4
+L(less_vec):
+L(less_vec_from_wmemset):
+	/* Less than 1 VEC.  */
+# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
+#  error Unsupported VEC_SIZE!
+# endif
+	/* Clear high bits from edi. Only keeping bits relevant to page
+	   cross check. Note that we are using rax which is set in
+	   MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out.  */
+	andl	$(PAGE_SIZE - 1), %edi
+	/* Check if VEC_SIZE store cross page. Mask stores suffer
+	   serious performance degradation when it has to fault suppress.  */
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %edi
+	/* This is generally considered a cold target.  */
+	ja	L(cross_page)
+# if VEC_SIZE > 32
+	movq	$-1, %rcx
+	bzhiq	%rdx, %rcx, %rcx
+	kmovq	%rcx, %k1
+# else
+	movl	$-1, %ecx
+	bzhil	%edx, %ecx, %ecx
+	kmovd	%ecx, %k1
+# endif
+	vmovdqu8 %VMM(0), (%rax){%k1}
+	VZEROUPPER_RETURN
+#endif
+
 #if defined USE_MULTIARCH && IS_IN (libc)
 END (MEMSET_SYMBOL (__memset, unaligned))

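The block added above is the existing L(less_vec) masked-store path, only relocated for alignment. As a rough illustration of what it computes, here is a hedged AVX-512 intrinsics sketch (hypothetical function name; assumes AVX512BW + BMI2 and that the caller has already excluded the page-cross case, which the real code branches away to L(cross_page)):

#include <immintrin.h>
#include <stddef.h>

/* Set len (< 64) bytes at dst to c with a single masked store.  */
static void
memset_less_vec_sketch (void *dst, int c, size_t len)
{
  __m512i v = _mm512_set1_epi8 ((char) c);
  /* Mask with the low `len` bits set, mirroring the bzhi in the asm.  */
  __mmask64 k = (__mmask64) _bzhi_u64 (~0ULL, (unsigned int) len);
  _mm512_mask_storeu_epi8 (dst, k, v);   /* bytes beyond len are untouched */
}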
@@ -185,54 +223,6 @@ L(last_2x_vec):
 #endif
 	VZEROUPPER_RETURN

-	/* If have AVX512 mask instructions put L(less_vec) close to
-	   entry as it doesn't take much space and is likely a hot target.
-	 */
-#ifdef USE_LESS_VEC_MASK_STORE
-	.p2align 4,, 10
-L(less_vec):
-L(less_vec_from_wmemset):
-	/* Less than 1 VEC.  */
-# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
-#  error Unsupported VEC_SIZE!
-# endif
-	/* Clear high bits from edi. Only keeping bits relevant to page
-	   cross check. Note that we are using rax which is set in
-	   MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out.  */
-	andl	$(PAGE_SIZE - 1), %edi
-	/* Check if VEC_SIZE store cross page. Mask stores suffer
-	   serious performance degradation when it has to fault suppress.
-	 */
-	cmpl	$(PAGE_SIZE - VEC_SIZE), %edi
-	/* This is generally considered a cold target.  */
-	ja	L(cross_page)
-# if VEC_SIZE > 32
-	movq	$-1, %rcx
-	bzhiq	%rdx, %rcx, %rcx
-	kmovq	%rcx, %k1
-# else
-	movl	$-1, %ecx
-	bzhil	%edx, %ecx, %ecx
-	kmovd	%ecx, %k1
-# endif
-	vmovdqu8 %VMM(0), (%rax){%k1}
-	VZEROUPPER_RETURN
-
-# if defined USE_MULTIARCH && IS_IN (libc)
-	/* Include L(stosb_local) here if including L(less_vec) between
-	   L(stosb_more_2x_vec) and ENTRY. This is to cache align the
-	   L(stosb_more_2x_vec) target.  */
-	.p2align 4,, 10
-L(stosb_local):
-	movzbl	%sil, %eax
-	mov	%RDX_LP, %RCX_LP
-	mov	%RDI_LP, %RDX_LP
-	rep	stosb
-	mov	%RDX_LP, %RAX_LP
-	VZEROUPPER_RETURN
-# endif
-#endif
-
 #if defined USE_MULTIARCH && IS_IN (libc)
 	.p2align 4
 L(stosb_more_2x_vec):

@@ -318,21 +308,33 @@ L(return_vzeroupper):
 	ret
 #endif

-	.p2align 4,, 10
-#ifndef USE_LESS_VEC_MASK_STORE
-# if defined USE_MULTIARCH && IS_IN (libc)
+#ifdef USE_WITH_AVX2
+	.p2align 4
+#else
+	.p2align 4,, 4
+#endif
+
+#if defined USE_MULTIARCH && IS_IN (libc)
 	/* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in
 	   range for 2-byte jump encoding.  */
 L(stosb_local):
+	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
+	jae	L(nt_memset)
 	movzbl	%sil, %eax
 	mov	%RDX_LP, %RCX_LP
 	mov	%RDI_LP, %RDX_LP
 	rep	stosb
+# if (defined USE_WITH_SSE2) || (defined USE_WITH_AVX512)
+	/* Use xchg to save 1-byte (this helps align targets below).  */
+	xchg	%RDX_LP, %RAX_LP
+# else
 	mov	%RDX_LP, %RAX_LP
-	VZEROUPPER_RETURN
 # endif
+	VZEROUPPER_RETURN
+#endif
+#ifndef USE_LESS_VEC_MASK_STORE
 	/* Define L(less_vec) only if not otherwise defined.  */
-	.p2align 4
+	.p2align 4,, 12
 L(less_vec):
 	/* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to
 	   xmm). This is only does anything for AVX2.  */

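In C terms, the new L(stosb_local) above behaves roughly like the sketch below. Here nt_threshold stands in for __x86_shared_non_temporal_threshold, memset_nt for the L(nt_memset) path, and the inline asm mirrors the movzbl / rep stosb sequence; these names are placeholders, not glibc symbols.

#include <stddef.h>

extern size_t nt_threshold;                           /* placeholder for the tunable */
extern void *memset_nt (void *dst, int c, size_t n);  /* placeholder for L(nt_memset) */

static void *
memset_stosb_local_sketch (void *dst, int c, size_t n)
{
  if (n >= nt_threshold)
    return memset_nt (dst, c, n);   /* new: very large sizes bypass rep stosb */

  void *d = dst;
  size_t count = n;
  /* rep stosb: store AL to [RDI], RCX times; RDI/RCX are updated in place.  */
  __asm__ volatile ("rep stosb"
                    : "+D" (d), "+c" (count)
                    : "a" ((unsigned char) c)
                    : "memory");
  return dst;                       /* memset returns the original pointer */
}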
@@ -423,4 +425,35 @@ L(between_2_3):
 	movb	%SET_REG8, -1(%LESS_VEC_REG, %rdx)
 #endif
 	ret
-END (MEMSET_SYMBOL (__memset, unaligned_erms))
+
+#if defined USE_MULTIARCH && IS_IN (libc)
+# ifdef USE_WITH_AVX512
+	/* Force align so the loop doesn't cross a cache-line.  */
+	.p2align 4
+# endif
+	.p2align 4,, 7
+	/* Memset using non-temporal stores.  */
+L(nt_memset):
+	VMOVU	%VMM(0), (VEC_SIZE * 0)(%rdi)
+	leaq	(VEC_SIZE * -4)(%rdi, %rdx), %rdx
+	/* Align DST.  */
+	orq	$(VEC_SIZE * 1 - 1), %rdi
+	incq	%rdi
+	.p2align 4,, 7
+L(nt_loop):
+	VMOVNT	%VMM(0), (VEC_SIZE * 0)(%rdi)
+	VMOVNT	%VMM(0), (VEC_SIZE * 1)(%rdi)
+	VMOVNT	%VMM(0), (VEC_SIZE * 2)(%rdi)
+	VMOVNT	%VMM(0), (VEC_SIZE * 3)(%rdi)
+	subq	$(VEC_SIZE * -4), %rdi
+	cmpq	%rdx, %rdi
+	jb	L(nt_loop)
+	sfence
+	VMOVU	%VMM(0), (VEC_SIZE * 0)(%rdx)
+	VMOVU	%VMM(0), (VEC_SIZE * 1)(%rdx)
+	VMOVU	%VMM(0), (VEC_SIZE * 2)(%rdx)
+	VMOVU	%VMM(0), (VEC_SIZE * 3)(%rdx)
+	VZEROUPPER_RETURN
+#endif
+
+END(MEMSET_SYMBOL(__memset, unaligned_erms))
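The L(nt_memset) code added above reads, in C with 32-byte vectors, roughly as follows. This is a sketch under stated assumptions: AVX2 intrinsics stand in for the size-generic VMOVNT/VMOVU macros, the function name is hypothetical, and len is assumed to be at least four vectors (the path is only reached far above that).

#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

static void *
memset_nt_sketch (void *dst, int c, size_t len)
{
  __m256i v = _mm256_set1_epi8 ((char) c);
  char *p = (char *) dst;
  char *tail = p + len - 4 * 32;            /* start of the last 4 vectors */

  _mm256_storeu_si256 ((__m256i *) p, v);   /* unaligned head store */
  /* Round p up to a 32-byte boundary (the orq/incq trick in the asm).  */
  p = (char *) (((uintptr_t) p | 31) + 1);

  while (p < tail)                          /* aligned streaming stores */
    {
      _mm256_stream_si256 ((__m256i *) (p + 0 * 32), v);
      _mm256_stream_si256 ((__m256i *) (p + 1 * 32), v);
      _mm256_stream_si256 ((__m256i *) (p + 2 * 32), v);
      _mm256_stream_si256 ((__m256i *) (p + 3 * 32), v);
      p += 4 * 32;
    }
  _mm_sfence ();                            /* order the weakly-ordered NT stores */

  /* Unaligned tail stores; they may overlap the loop's last iteration.  */
  _mm256_storeu_si256 ((__m256i *) (tail + 0 * 32), v);
  _mm256_storeu_si256 ((__m256i *) (tail + 1 * 32), v);
  _mm256_storeu_si256 ((__m256i *) (tail + 2 * 32), v);
  _mm256_storeu_si256 ((__m256i *) (tail + 3 * 32), v);
  return dst;
}

The head store plus the four tail stores cover anything the aligned streaming loop skips, so no scalar cleanup is needed.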