mirror of git://sourceware.org/git/glibc.git
AArch64: Improve strlen_asimd performance (bug 25824)
Optimize strlen using a mix of scalar and SIMD code. On modern micro architectures large strings are 2.6 times faster than existing strlen_asimd and 35% faster than the new MTE version of strlen. On a random strlen benchmark using small sizes the speedup is 7% vs strlen_asimd and 40% vs the MTE strlen. This fixes the main strlen regressions on Cortex-A53 and other cores with a simple Neon unit. Rename __strlen_generic to __strlen_mte, and select strlen_asimd when MTE is not enabled (this is waiting on support for a HWCAP_MTE bit). This fixes big-endian bug 25824. Passes GLIBC regression tests. Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
This commit is contained in:
parent
76b8442db5
commit
f46ef33ad1
|
|
@ -3,5 +3,5 @@ sysdep_routines += memcpy_generic memcpy_advsimd memcpy_thunderx memcpy_thunderx
|
||||||
memcpy_falkor \
|
memcpy_falkor \
|
||||||
memset_generic memset_falkor memset_emag memset_kunpeng \
|
memset_generic memset_falkor memset_emag memset_kunpeng \
|
||||||
memchr_generic memchr_nosimd \
|
memchr_generic memchr_nosimd \
|
||||||
strlen_generic strlen_asimd
|
strlen_mte strlen_asimd
|
||||||
endif
|
endif
|
||||||
|
|
|
||||||
|
|
@ -63,7 +63,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||||
|
|
||||||
IFUNC_IMPL (i, name, strlen,
|
IFUNC_IMPL (i, name, strlen,
|
||||||
IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_asimd)
|
IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_asimd)
|
||||||
IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_generic))
|
IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_mte))
|
||||||
|
|
||||||
return i;
|
return i;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -26,17 +26,15 @@
|
||||||
# include <string.h>
|
# include <string.h>
|
||||||
# include <init-arch.h>
|
# include <init-arch.h>
|
||||||
|
|
||||||
#define USE_ASIMD_STRLEN() IS_FALKOR (midr)
|
/* This should check HWCAP_MTE when it is available. */
|
||||||
|
#define MTE_ENABLED() (false)
|
||||||
|
|
||||||
extern __typeof (__redirect_strlen) __strlen;
|
extern __typeof (__redirect_strlen) __strlen;
|
||||||
|
|
||||||
extern __typeof (__redirect_strlen) __strlen_generic attribute_hidden;
|
extern __typeof (__redirect_strlen) __strlen_mte attribute_hidden;
|
||||||
extern __typeof (__redirect_strlen) __strlen_asimd attribute_hidden;
|
extern __typeof (__redirect_strlen) __strlen_asimd attribute_hidden;
|
||||||
|
|
||||||
libc_ifunc (__strlen,
|
libc_ifunc (__strlen, (MTE_ENABLED () ? __strlen_mte : __strlen_asimd));
|
||||||
(USE_ASIMD_STRLEN () || IS_KUNPENG920 (midr)
|
|
||||||
? __strlen_asimd
|
|
||||||
:__strlen_generic));
|
|
||||||
|
|
||||||
# undef strlen
|
# undef strlen
|
||||||
strong_alias (__strlen, strlen);
|
strong_alias (__strlen, strlen);
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
/* Strlen implementation that uses ASIMD instructions for load and NULL checks.
|
/* Optimized strlen implementation using SIMD.
|
||||||
Copyright (C) 2018-2020 Free Software Foundation, Inc.
|
Copyright (C) 2018-2020 Free Software Foundation, Inc.
|
||||||
|
|
||||||
This file is part of the GNU C Library.
|
This file is part of the GNU C Library.
|
||||||
|
|
@ -20,80 +20,90 @@
|
||||||
#include <sysdep.h>
|
#include <sysdep.h>
|
||||||
|
|
||||||
/* Assumptions:
|
/* Assumptions:
|
||||||
|
*
|
||||||
|
* ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
|
||||||
|
* Not MTE compatible.
|
||||||
|
*/
|
||||||
|
|
||||||
ARMv8-a, AArch64, ASIMD, unaligned accesses, min page size 4k. */
|
#define srcin x0
|
||||||
|
#define len x0
|
||||||
|
|
||||||
|
#define src x1
|
||||||
|
#define data1 x2
|
||||||
|
#define data2 x3
|
||||||
|
#define has_nul1 x4
|
||||||
|
#define has_nul2 x5
|
||||||
|
#define tmp1 x4
|
||||||
|
#define tmp2 x5
|
||||||
|
#define tmp3 x6
|
||||||
|
#define tmp4 x7
|
||||||
|
#define zeroones x8
|
||||||
|
|
||||||
|
#define maskv v0
|
||||||
|
#define maskd d0
|
||||||
|
#define dataq1 q1
|
||||||
|
#define dataq2 q2
|
||||||
|
#define datav1 v1
|
||||||
|
#define datav2 v2
|
||||||
|
#define tmp x2
|
||||||
|
#define tmpw w2
|
||||||
|
#define synd x3
|
||||||
|
#define shift x4
|
||||||
|
|
||||||
|
/* For the first 32 bytes, NUL detection works on the principle that
|
||||||
|
(X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a
|
||||||
|
byte is zero, and can be done in parallel across the entire word. */
|
||||||
|
|
||||||
|
#define REP8_01 0x0101010101010101
|
||||||
|
#define REP8_7f 0x7f7f7f7f7f7f7f7f
|
||||||
|
|
||||||
/* To test the page crossing code path more thoroughly, compile with
|
/* To test the page crossing code path more thoroughly, compile with
|
||||||
-DTEST_PAGE_CROSS - this will force all calls through the slower
|
-DTEST_PAGE_CROSS - this will force all calls through the slower
|
||||||
entry path. This option is not intended for production use. */
|
entry path. This option is not intended for production use. */
|
||||||
|
|
||||||
/* Arguments and results. */
|
|
||||||
#define srcin x0
|
|
||||||
#define len x0
|
|
||||||
|
|
||||||
/* Locals and temporaries. */
|
|
||||||
#define src x1
|
|
||||||
#define data1 x2
|
|
||||||
#define data2 x3
|
|
||||||
#define has_nul1 x4
|
|
||||||
#define has_nul2 x5
|
|
||||||
#define tmp1 x4
|
|
||||||
#define tmp2 x5
|
|
||||||
#define tmp3 x6
|
|
||||||
#define tmp4 x7
|
|
||||||
#define zeroones x8
|
|
||||||
#define dataq q2
|
|
||||||
#define datav v2
|
|
||||||
#define datab2 b3
|
|
||||||
#define dataq2 q3
|
|
||||||
#define datav2 v3
|
|
||||||
|
|
||||||
#define REP8_01 0x0101010101010101
|
|
||||||
#define REP8_7f 0x7f7f7f7f7f7f7f7f
|
|
||||||
|
|
||||||
#ifdef TEST_PAGE_CROSS
|
#ifdef TEST_PAGE_CROSS
|
||||||
# define MIN_PAGE_SIZE 16
|
# define MIN_PAGE_SIZE 32
|
||||||
#else
|
#else
|
||||||
# define MIN_PAGE_SIZE 4096
|
# define MIN_PAGE_SIZE 4096
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* Since strings are short on average, we check the first 16 bytes
|
/* Core algorithm:
|
||||||
of the string for a NUL character. In order to do an unaligned load
|
|
||||||
safely we have to do a page cross check first. If there is a NUL
|
|
||||||
byte we calculate the length from the 2 8-byte words using
|
|
||||||
conditional select to reduce branch mispredictions (it is unlikely
|
|
||||||
strlen_asimd will be repeatedly called on strings with the same
|
|
||||||
length).
|
|
||||||
|
|
||||||
If the string is longer than 16 bytes, we align src so don't need
|
Since strings are short on average, we check the first 32 bytes of the
|
||||||
further page cross checks, and process 16 bytes per iteration.
|
string for a NUL character without aligning the string. In order to use
|
||||||
|
unaligned loads safely we must do a page cross check first.
|
||||||
|
|
||||||
If the page cross check fails, we read 16 bytes from an aligned
|
If there is a NUL byte we calculate the length from the 2 8-byte words
|
||||||
address, remove any characters before the string, and continue
|
using conditional select to reduce branch mispredictions (it is unlikely
|
||||||
in the main loop using aligned loads. Since strings crossing a
|
strlen will be repeatedly called on strings with the same length).
|
||||||
page in the first 16 bytes are rare (probability of
|
|
||||||
16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
|
|
||||||
|
|
||||||
AArch64 systems have a minimum page size of 4k. We don't bother
|
If the string is longer than 32 bytes, align src so we don't need further
|
||||||
checking for larger page sizes - the cost of setting up the correct
|
page cross checks, and process 32 bytes per iteration using a fast SIMD
|
||||||
page size is just not worth the extra gain from a small reduction in
|
loop.
|
||||||
the cases taking the slow path. Note that we only care about
|
|
||||||
whether the first fetch, which may be misaligned, crosses a page
|
If the page cross check fails, we read 32 bytes from an aligned address,
|
||||||
boundary. */
|
and ignore any characters before the string. If it contains a NUL
|
||||||
|
character, return the length, if not, continue in the main loop. */
|
||||||
|
|
||||||
|
ENTRY (__strlen_asimd)
|
||||||
|
DELOUSE (0)
|
||||||
|
|
||||||
ENTRY_ALIGN (__strlen_asimd, 6)
|
|
||||||
DELOUSE (0)
|
|
||||||
DELOUSE (1)
|
|
||||||
and tmp1, srcin, MIN_PAGE_SIZE - 1
|
and tmp1, srcin, MIN_PAGE_SIZE - 1
|
||||||
mov zeroones, REP8_01
|
cmp tmp1, MIN_PAGE_SIZE - 32
|
||||||
cmp tmp1, MIN_PAGE_SIZE - 16
|
b.hi L(page_cross)
|
||||||
b.gt L(page_cross)
|
|
||||||
|
/* Look for a NUL byte in the first 16 bytes. */
|
||||||
ldp data1, data2, [srcin]
|
ldp data1, data2, [srcin]
|
||||||
|
mov zeroones, REP8_01
|
||||||
|
|
||||||
#ifdef __AARCH64EB__
|
#ifdef __AARCH64EB__
|
||||||
|
/* For big-endian, carry propagation (if the final byte in the
|
||||||
|
string is 0x01) means we cannot use has_nul1/2 directly.
|
||||||
|
Since we expect strings to be small and early-exit,
|
||||||
|
byte-swap the data now so has_null1/2 will be correct. */
|
||||||
rev data1, data1
|
rev data1, data1
|
||||||
rev data2, data2
|
rev data2, data2
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
sub tmp1, data1, zeroones
|
sub tmp1, data1, zeroones
|
||||||
orr tmp2, data1, REP8_7f
|
orr tmp2, data1, REP8_7f
|
||||||
sub tmp3, data2, zeroones
|
sub tmp3, data2, zeroones
|
||||||
|
|
@ -101,78 +111,105 @@ ENTRY_ALIGN (__strlen_asimd, 6)
|
||||||
bics has_nul1, tmp1, tmp2
|
bics has_nul1, tmp1, tmp2
|
||||||
bic has_nul2, tmp3, tmp4
|
bic has_nul2, tmp3, tmp4
|
||||||
ccmp has_nul2, 0, 0, eq
|
ccmp has_nul2, 0, 0, eq
|
||||||
beq L(main_loop_entry)
|
b.eq L(bytes16_31)
|
||||||
|
|
||||||
|
/* Find the exact offset of the first NUL byte in the first 16 bytes
|
||||||
|
from the string start. Enter with C = has_nul1 == 0. */
|
||||||
csel has_nul1, has_nul1, has_nul2, cc
|
csel has_nul1, has_nul1, has_nul2, cc
|
||||||
mov len, 8
|
mov len, 8
|
||||||
rev has_nul1, has_nul1
|
rev has_nul1, has_nul1
|
||||||
clz tmp1, has_nul1
|
|
||||||
csel len, xzr, len, cc
|
csel len, xzr, len, cc
|
||||||
|
clz tmp1, has_nul1
|
||||||
add len, len, tmp1, lsr 3
|
add len, len, tmp1, lsr 3
|
||||||
ret
|
ret
|
||||||
|
|
||||||
L(main_loop_entry):
|
.p2align 3
|
||||||
bic src, srcin, 15
|
/* Look for a NUL byte at offset 16..31 in the string. */
|
||||||
sub src, src, 16
|
L(bytes16_31):
|
||||||
|
ldp data1, data2, [srcin, 16]
|
||||||
L(main_loop):
|
|
||||||
ldr dataq, [src, 32]!
|
|
||||||
L(page_cross_entry):
|
|
||||||
/* Get the minimum value and keep going if it is not zero. */
|
|
||||||
uminv datab2, datav.16b
|
|
||||||
mov tmp1, datav2.d[0]
|
|
||||||
cbz tmp1, L(tail)
|
|
||||||
ldr dataq, [src, 16]
|
|
||||||
uminv datab2, datav.16b
|
|
||||||
mov tmp1, datav2.d[0]
|
|
||||||
cbnz tmp1, L(main_loop)
|
|
||||||
add src, src, 16
|
|
||||||
|
|
||||||
L(tail):
|
|
||||||
#ifdef __AARCH64EB__
|
#ifdef __AARCH64EB__
|
||||||
rev64 datav.16b, datav.16b
|
|
||||||
#endif
|
|
||||||
/* Set te NULL byte as 0xff and the rest as 0x00, move the data into a
|
|
||||||
pair of scalars and then compute the length from the earliest NULL
|
|
||||||
byte. */
|
|
||||||
cmeq datav.16b, datav.16b, #0
|
|
||||||
mov data1, datav.d[0]
|
|
||||||
mov data2, datav.d[1]
|
|
||||||
cmp data1, 0
|
|
||||||
csel data1, data1, data2, ne
|
|
||||||
sub len, src, srcin
|
|
||||||
rev data1, data1
|
rev data1, data1
|
||||||
add tmp2, len, 8
|
rev data2, data2
|
||||||
clz tmp1, data1
|
#endif
|
||||||
csel len, len, tmp2, ne
|
sub tmp1, data1, zeroones
|
||||||
|
orr tmp2, data1, REP8_7f
|
||||||
|
sub tmp3, data2, zeroones
|
||||||
|
orr tmp4, data2, REP8_7f
|
||||||
|
bics has_nul1, tmp1, tmp2
|
||||||
|
bic has_nul2, tmp3, tmp4
|
||||||
|
ccmp has_nul2, 0, 0, eq
|
||||||
|
b.eq L(loop_entry)
|
||||||
|
|
||||||
|
/* Find the exact offset of the first NUL byte at offset 16..31 from
|
||||||
|
the string start. Enter with C = has_nul1 == 0. */
|
||||||
|
csel has_nul1, has_nul1, has_nul2, cc
|
||||||
|
mov len, 24
|
||||||
|
rev has_nul1, has_nul1
|
||||||
|
mov tmp3, 16
|
||||||
|
clz tmp1, has_nul1
|
||||||
|
csel len, tmp3, len, cc
|
||||||
add len, len, tmp1, lsr 3
|
add len, len, tmp1, lsr 3
|
||||||
ret
|
ret
|
||||||
|
|
||||||
/* Load 16 bytes from [srcin & ~15] and force the bytes that precede
|
L(loop_entry):
|
||||||
srcin to 0xff, so we ignore any NUL bytes before the string.
|
bic src, srcin, 31
|
||||||
Then continue in the aligned loop. */
|
|
||||||
L(page_cross):
|
.p2align 5
|
||||||
mov tmp3, 63
|
L(loop):
|
||||||
bic src, srcin, 15
|
ldp dataq1, dataq2, [src, 32]!
|
||||||
and tmp1, srcin, 7
|
uminp maskv.16b, datav1.16b, datav2.16b
|
||||||
ands tmp2, srcin, 8
|
uminp maskv.16b, maskv.16b, maskv.16b
|
||||||
ldr dataq, [src]
|
cmeq maskv.8b, maskv.8b, 0
|
||||||
lsl tmp1, tmp1, 3
|
fmov synd, maskd
|
||||||
csel tmp2, tmp2, tmp1, eq
|
cbz synd, L(loop)
|
||||||
csel tmp1, tmp1, tmp3, eq
|
|
||||||
mov tmp4, -1
|
/* Low 32 bits of synd are non-zero if a NUL was found in datav1. */
|
||||||
|
cmeq maskv.16b, datav1.16b, 0
|
||||||
|
sub len, src, srcin
|
||||||
|
tst synd, 0xffffffff
|
||||||
|
b.ne 1f
|
||||||
|
cmeq maskv.16b, datav2.16b, 0
|
||||||
|
add len, len, 16
|
||||||
|
1:
|
||||||
|
/* Generate a bitmask and compute correct byte offset. */
|
||||||
#ifdef __AARCH64EB__
|
#ifdef __AARCH64EB__
|
||||||
/* Big-endian. Early bytes are at MSB. */
|
bic maskv.8h, 0xf0
|
||||||
lsr tmp1, tmp4, tmp1
|
|
||||||
lsr tmp2, tmp4, tmp2
|
|
||||||
#else
|
#else
|
||||||
/* Little-endian. Early bytes are at LSB. */
|
bic maskv.8h, 0x0f, lsl 8
|
||||||
lsl tmp1, tmp4, tmp1
|
|
||||||
lsl tmp2, tmp4, tmp2
|
|
||||||
#endif
|
#endif
|
||||||
mov datav2.d[0], tmp1
|
umaxp maskv.16b, maskv.16b, maskv.16b
|
||||||
mov datav2.d[1], tmp2
|
fmov synd, maskd
|
||||||
orn datav.16b, datav.16b, datav2.16b
|
#ifndef __AARCH64EB__
|
||||||
b L(page_cross_entry)
|
rbit synd, synd
|
||||||
|
#endif
|
||||||
|
clz tmp, synd
|
||||||
|
add len, len, tmp, lsr 2
|
||||||
|
ret
|
||||||
|
|
||||||
|
.p2align 4
|
||||||
|
|
||||||
|
L(page_cross):
|
||||||
|
bic src, srcin, 31
|
||||||
|
mov tmpw, 0x0c03
|
||||||
|
movk tmpw, 0xc030, lsl 16
|
||||||
|
ld1 {datav1.16b, datav2.16b}, [src]
|
||||||
|
dup maskv.4s, tmpw
|
||||||
|
cmeq datav1.16b, datav1.16b, 0
|
||||||
|
cmeq datav2.16b, datav2.16b, 0
|
||||||
|
and datav1.16b, datav1.16b, maskv.16b
|
||||||
|
and datav2.16b, datav2.16b, maskv.16b
|
||||||
|
addp maskv.16b, datav1.16b, datav2.16b
|
||||||
|
addp maskv.16b, maskv.16b, maskv.16b
|
||||||
|
fmov synd, maskd
|
||||||
|
lsl shift, srcin, 1
|
||||||
|
lsr synd, synd, shift
|
||||||
|
cbz synd, L(loop)
|
||||||
|
|
||||||
|
rbit synd, synd
|
||||||
|
clz len, synd
|
||||||
|
lsr len, len, 1
|
||||||
|
ret
|
||||||
|
|
||||||
END (__strlen_asimd)
|
END (__strlen_asimd)
|
||||||
weak_alias (__strlen_asimd, strlen_asimd)
|
weak_alias (__strlen_asimd, strlen_asimd)
|
||||||
libc_hidden_builtin_def (strlen_asimd)
|
libc_hidden_builtin_def (strlen_asimd)
|
||||||
|
|
|
||||||
|
|
@ -17,14 +17,14 @@
|
||||||
<https://www.gnu.org/licenses/>. */
|
<https://www.gnu.org/licenses/>. */
|
||||||
|
|
||||||
/* The actual strlen code is in ../strlen.S. If we are building libc this file
|
/* The actual strlen code is in ../strlen.S. If we are building libc this file
|
||||||
defines __strlen_generic. Otherwise the include of ../strlen.S will define
|
defines __strlen_mte. Otherwise the include of ../strlen.S will define
|
||||||
the normal __strlen entry points. */
|
the normal __strlen entry points. */
|
||||||
|
|
||||||
#include <sysdep.h>
|
#include <sysdep.h>
|
||||||
|
|
||||||
#if IS_IN (libc)
|
#if IS_IN (libc)
|
||||||
|
|
||||||
# define STRLEN __strlen_generic
|
# define STRLEN __strlen_mte
|
||||||
|
|
||||||
/* Do not hide the generic version of strlen, we use it internally. */
|
/* Do not hide the generic version of strlen, we use it internally. */
|
||||||
# undef libc_hidden_builtin_def
|
# undef libc_hidden_builtin_def
|
||||||
|
|
@ -32,7 +32,7 @@
|
||||||
|
|
||||||
# ifdef SHARED
|
# ifdef SHARED
|
||||||
/* It doesn't make sense to send libc-internal strlen calls through a PLT. */
|
/* It doesn't make sense to send libc-internal strlen calls through a PLT. */
|
||||||
.globl __GI_strlen; __GI_strlen = __strlen_generic
|
.globl __GI_strlen; __GI_strlen = __strlen_mte
|
||||||
# endif
|
# endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
Loading…
Reference in New Issue