mirror of git://sourceware.org/git/glibc.git
x86: Optimize str{n}casecmp TOLOWER logic in strcmp-sse42.S
Slightly faster method of doing TOLOWER that saves an instruction. Also replace the hard-coded 5-byte nop with .p2align 4. On builds with CET enabled, the hard-coded nop misaligned the entry to strcasecmp. geometric_mean(N=40) of all benchmarks New / Original: .920 All string/memory tests pass. Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
This commit is contained in:
parent
670b54bc58
commit
d154758e61
|
|
@ -88,9 +88,8 @@ ENTRY (GLABEL(__strcasecmp))
|
||||||
movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
|
movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
|
||||||
mov %fs:(%rax),%RDX_LP
|
mov %fs:(%rax),%RDX_LP
|
||||||
|
|
||||||
// XXX 5 byte should be before the function
|
/* Either 1 or 5 bytes (depending on whether CET is enabled). */
|
||||||
/* 5-byte NOP. */
|
.p2align 4
|
||||||
.byte 0x0f,0x1f,0x44,0x00,0x00
|
|
||||||
END (GLABEL(__strcasecmp))
|
END (GLABEL(__strcasecmp))
|
||||||
/* FALLTHROUGH to strcasecmp_l. */
|
/* FALLTHROUGH to strcasecmp_l. */
|
||||||
#endif
|
#endif
|
||||||
|
|
@ -99,9 +98,8 @@ ENTRY (GLABEL(__strncasecmp))
|
||||||
movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
|
movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
|
||||||
mov %fs:(%rax),%RCX_LP
|
mov %fs:(%rax),%RCX_LP
|
||||||
|
|
||||||
// XXX 5 byte should be before the function
|
/* Either 1 or 5 bytes (depending on whether CET is enabled). */
|
||||||
/* 5-byte NOP. */
|
.p2align 4
|
||||||
.byte 0x0f,0x1f,0x44,0x00,0x00
|
|
||||||
END (GLABEL(__strncasecmp))
|
END (GLABEL(__strncasecmp))
|
||||||
/* FALLTHROUGH to strncasecmp_l. */
|
/* FALLTHROUGH to strncasecmp_l. */
|
||||||
#endif
|
#endif
|
||||||
|
|
@ -169,27 +167,22 @@ STRCMP_SSE42:
|
||||||
#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
|
#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
|
||||||
.section .rodata.cst16,"aM",@progbits,16
|
.section .rodata.cst16,"aM",@progbits,16
|
||||||
.align 16
|
.align 16
|
||||||
LABEL(belowupper):
|
LABEL(lcase_min):
|
||||||
.quad 0x4040404040404040
|
.quad 0x3f3f3f3f3f3f3f3f
|
||||||
.quad 0x4040404040404040
|
.quad 0x3f3f3f3f3f3f3f3f
|
||||||
LABEL(topupper):
|
LABEL(lcase_max):
|
||||||
# ifdef USE_AVX
|
.quad 0x9999999999999999
|
||||||
.quad 0x5a5a5a5a5a5a5a5a
|
.quad 0x9999999999999999
|
||||||
.quad 0x5a5a5a5a5a5a5a5a
|
LABEL(case_add):
|
||||||
# else
|
|
||||||
.quad 0x5b5b5b5b5b5b5b5b
|
|
||||||
.quad 0x5b5b5b5b5b5b5b5b
|
|
||||||
# endif
|
|
||||||
LABEL(touppermask):
|
|
||||||
.quad 0x2020202020202020
|
.quad 0x2020202020202020
|
||||||
.quad 0x2020202020202020
|
.quad 0x2020202020202020
|
||||||
.previous
|
.previous
|
||||||
movdqa LABEL(belowupper)(%rip), %xmm4
|
movdqa LABEL(lcase_min)(%rip), %xmm4
|
||||||
# define UCLOW_reg %xmm4
|
# define LCASE_MIN_reg %xmm4
|
||||||
movdqa LABEL(topupper)(%rip), %xmm5
|
movdqa LABEL(lcase_max)(%rip), %xmm5
|
||||||
# define UCHIGH_reg %xmm5
|
# define LCASE_MAX_reg %xmm5
|
||||||
movdqa LABEL(touppermask)(%rip), %xmm6
|
movdqa LABEL(case_add)(%rip), %xmm6
|
||||||
# define LCQWORD_reg %xmm6
|
# define CASE_ADD_reg %xmm6
|
||||||
#endif
|
#endif
|
||||||
cmp $0x30, %ecx
|
cmp $0x30, %ecx
|
||||||
ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */
|
ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */
|
||||||
|
|
@ -200,32 +193,26 @@ LABEL(touppermask):
|
||||||
#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
|
#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
|
||||||
# ifdef USE_AVX
|
# ifdef USE_AVX
|
||||||
# define TOLOWER(reg1, reg2) \
|
# define TOLOWER(reg1, reg2) \
|
||||||
vpcmpgtb UCLOW_reg, reg1, %xmm7; \
|
vpaddb LCASE_MIN_reg, reg1, %xmm7; \
|
||||||
vpcmpgtb UCHIGH_reg, reg1, %xmm8; \
|
vpaddb LCASE_MIN_reg, reg2, %xmm8; \
|
||||||
vpcmpgtb UCLOW_reg, reg2, %xmm9; \
|
vpcmpgtb LCASE_MAX_reg, %xmm7, %xmm7; \
|
||||||
vpcmpgtb UCHIGH_reg, reg2, %xmm10; \
|
vpcmpgtb LCASE_MAX_reg, %xmm8, %xmm8; \
|
||||||
vpandn %xmm7, %xmm8, %xmm8; \
|
vpandn CASE_ADD_reg, %xmm7, %xmm7; \
|
||||||
vpandn %xmm9, %xmm10, %xmm10; \
|
vpandn CASE_ADD_reg, %xmm8, %xmm8; \
|
||||||
vpand LCQWORD_reg, %xmm8, %xmm8; \
|
vpaddb %xmm7, reg1, reg1; \
|
||||||
vpand LCQWORD_reg, %xmm10, %xmm10; \
|
vpaddb %xmm8, reg2, reg2
|
||||||
vpor reg1, %xmm8, reg1; \
|
|
||||||
vpor reg2, %xmm10, reg2
|
|
||||||
# else
|
# else
|
||||||
# define TOLOWER(reg1, reg2) \
|
# define TOLOWER(reg1, reg2) \
|
||||||
movdqa reg1, %xmm7; \
|
movdqa LCASE_MIN_reg, %xmm7; \
|
||||||
movdqa UCHIGH_reg, %xmm8; \
|
movdqa LCASE_MIN_reg, %xmm8; \
|
||||||
movdqa reg2, %xmm9; \
|
paddb reg1, %xmm7; \
|
||||||
movdqa UCHIGH_reg, %xmm10; \
|
paddb reg2, %xmm8; \
|
||||||
pcmpgtb UCLOW_reg, %xmm7; \
|
pcmpgtb LCASE_MAX_reg, %xmm7; \
|
||||||
pcmpgtb reg1, %xmm8; \
|
pcmpgtb LCASE_MAX_reg, %xmm8; \
|
||||||
pcmpgtb UCLOW_reg, %xmm9; \
|
pandn CASE_ADD_reg, %xmm7; \
|
||||||
pcmpgtb reg2, %xmm10; \
|
pandn CASE_ADD_reg, %xmm8; \
|
||||||
pand %xmm8, %xmm7; \
|
paddb %xmm7, reg1; \
|
||||||
pand %xmm10, %xmm9; \
|
paddb %xmm8, reg2
|
||||||
pand LCQWORD_reg, %xmm7; \
|
|
||||||
pand LCQWORD_reg, %xmm9; \
|
|
||||||
por %xmm7, reg1; \
|
|
||||||
por %xmm9, reg2
|
|
||||||
# endif
|
# endif
|
||||||
TOLOWER (%xmm1, %xmm2)
|
TOLOWER (%xmm1, %xmm2)
|
||||||
#else
|
#else
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue