AArch64: Improve strlen_asimd performance (bug 25824)

Optimize strlen using a mix of scalar and SIMD code.  On modern
microarchitectures large strings are 2.6 times faster than existing
strlen_asimd and 35% faster than the new MTE version of strlen.

On a random strlen benchmark using small sizes the speedup is 7% vs
strlen_asimd and 40% vs the MTE strlen.  This fixes the main strlen
regressions on Cortex-A53 and other cores with a simple Neon unit.

Rename __strlen_generic to __strlen_mte, and select strlen_asimd when
MTE is not enabled (this is waiting on support for a HWCAP_MTE bit).

This fixes big-endian bug 25824. Passes GLIBC regression tests.

Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
This commit is contained in:
Wilco Dijkstra 2020-07-17 14:09:36 +01:00
parent 76b8442db5
commit f46ef33ad1
5 changed files with 157 additions and 122 deletions

View File

@ -3,5 +3,5 @@ sysdep_routines += memcpy_generic memcpy_advsimd memcpy_thunderx memcpy_thunderx
memcpy_falkor \ memcpy_falkor \
memset_generic memset_falkor memset_emag memset_kunpeng \ memset_generic memset_falkor memset_emag memset_kunpeng \
memchr_generic memchr_nosimd \ memchr_generic memchr_nosimd \
strlen_generic strlen_asimd strlen_mte strlen_asimd
endif endif

View File

@ -63,7 +63,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL (i, name, strlen, IFUNC_IMPL (i, name, strlen,
IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_asimd) IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_asimd)
IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_generic)) IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_mte))
return i; return i;
} }

View File

@ -26,17 +26,15 @@
# include <string.h> # include <string.h>
# include <init-arch.h> # include <init-arch.h>
#define USE_ASIMD_STRLEN() IS_FALKOR (midr) /* This should check HWCAP_MTE when it is available. */
#define MTE_ENABLED() (false)
extern __typeof (__redirect_strlen) __strlen; extern __typeof (__redirect_strlen) __strlen;
extern __typeof (__redirect_strlen) __strlen_generic attribute_hidden; extern __typeof (__redirect_strlen) __strlen_mte attribute_hidden;
extern __typeof (__redirect_strlen) __strlen_asimd attribute_hidden; extern __typeof (__redirect_strlen) __strlen_asimd attribute_hidden;
libc_ifunc (__strlen, libc_ifunc (__strlen, (MTE_ENABLED () ? __strlen_mte : __strlen_asimd));
(USE_ASIMD_STRLEN () || IS_KUNPENG920 (midr)
? __strlen_asimd
:__strlen_generic));
# undef strlen # undef strlen
strong_alias (__strlen, strlen); strong_alias (__strlen, strlen);

View File

@ -1,4 +1,4 @@
/* Strlen implementation that uses ASIMD instructions for load and NULL checks. /* Optimized strlen implementation using SIMD.
Copyright (C) 2018-2020 Free Software Foundation, Inc. Copyright (C) 2018-2020 Free Software Foundation, Inc.
This file is part of the GNU C Library. This file is part of the GNU C Library.
@ -20,18 +20,14 @@
#include <sysdep.h> #include <sysdep.h>
/* Assumptions: /* Assumptions:
*
* ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
* Not MTE compatible.
*/
ARMv8-a, AArch64, ASIMD, unaligned accesses, min page size 4k. */
/* To test the page crossing code path more thoroughly, compile with
-DTEST_PAGE_CROSS - this will force all calls through the slower
entry path. This option is not intended for production use. */
/* Arguments and results. */
#define srcin x0 #define srcin x0
#define len x0 #define len x0
/* Locals and temporaries. */
#define src x1 #define src x1
#define data1 x2 #define data1 x2
#define data2 x3 #define data2 x3
@ -42,58 +38,72 @@
#define tmp3 x6 #define tmp3 x6
#define tmp4 x7 #define tmp4 x7
#define zeroones x8 #define zeroones x8
#define dataq q2
#define datav v2 #define maskv v0
#define datab2 b3 #define maskd d0
#define dataq2 q3 #define dataq1 q1
#define datav2 v3 #define dataq2 q2
#define datav1 v1
#define datav2 v2
#define tmp x2
#define tmpw w2
#define synd x3
#define shift x4
/* For the first 32 bytes, NUL detection works on the principle that
(X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a
byte is zero, and can be done in parallel across the entire word. */
#define REP8_01 0x0101010101010101 #define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f #define REP8_7f 0x7f7f7f7f7f7f7f7f
/* To test the page crossing code path more thoroughly, compile with
-DTEST_PAGE_CROSS - this will force all calls through the slower
entry path. This option is not intended for production use. */
#ifdef TEST_PAGE_CROSS #ifdef TEST_PAGE_CROSS
# define MIN_PAGE_SIZE 16 # define MIN_PAGE_SIZE 32
#else #else
# define MIN_PAGE_SIZE 4096 # define MIN_PAGE_SIZE 4096
#endif #endif
/* Since strings are short on average, we check the first 16 bytes /* Core algorithm:
of the string for a NUL character. In order to do an unaligned load
safely we have to do a page cross check first. If there is a NUL
byte we calculate the length from the 2 8-byte words using
conditional select to reduce branch mispredictions (it is unlikely
strlen_asimd will be repeatedly called on strings with the same
length).
If the string is longer than 16 bytes, we align src so don't need Since strings are short on average, we check the first 32 bytes of the
further page cross checks, and process 16 bytes per iteration. string for a NUL character without aligning the string. In order to use
unaligned loads safely we must do a page cross check first.
If the page cross check fails, we read 16 bytes from an aligned If there is a NUL byte we calculate the length from the 2 8-byte words
address, remove any characters before the string, and continue using conditional select to reduce branch mispredictions (it is unlikely
in the main loop using aligned loads. Since strings crossing a strlen will be repeatedly called on strings with the same length).
page in the first 16 bytes are rare (probability of
16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
AArch64 systems have a minimum page size of 4k. We don't bother If the string is longer than 32 bytes, align src so we don't need further
checking for larger page sizes - the cost of setting up the correct page cross checks, and process 32 bytes per iteration using a fast SIMD
page size is just not worth the extra gain from a small reduction in loop.
the cases taking the slow path. Note that we only care about
whether the first fetch, which may be misaligned, crosses a page
boundary. */
ENTRY_ALIGN (__strlen_asimd, 6) If the page cross check fails, we read 32 bytes from an aligned address,
and ignore any characters before the string. If it contains a NUL
character, return the length, if not, continue in the main loop. */
ENTRY (__strlen_asimd)
DELOUSE (0) DELOUSE (0)
DELOUSE (1)
and tmp1, srcin, MIN_PAGE_SIZE - 1 and tmp1, srcin, MIN_PAGE_SIZE - 1
mov zeroones, REP8_01 cmp tmp1, MIN_PAGE_SIZE - 32
cmp tmp1, MIN_PAGE_SIZE - 16 b.hi L(page_cross)
b.gt L(page_cross)
/* Look for a NUL byte in the first 16 bytes. */
ldp data1, data2, [srcin] ldp data1, data2, [srcin]
mov zeroones, REP8_01
#ifdef __AARCH64EB__ #ifdef __AARCH64EB__
/* For big-endian, carry propagation (if the final byte in the
string is 0x01) means we cannot use has_nul1/2 directly.
Since we expect strings to be small and early-exit,
byte-swap the data now so has_nul1/2 will be correct. */
rev data1, data1 rev data1, data1
rev data2, data2 rev data2, data2
#endif #endif
sub tmp1, data1, zeroones sub tmp1, data1, zeroones
orr tmp2, data1, REP8_7f orr tmp2, data1, REP8_7f
sub tmp3, data2, zeroones sub tmp3, data2, zeroones
@ -101,78 +111,105 @@ ENTRY_ALIGN (__strlen_asimd, 6)
bics has_nul1, tmp1, tmp2 bics has_nul1, tmp1, tmp2
bic has_nul2, tmp3, tmp4 bic has_nul2, tmp3, tmp4
ccmp has_nul2, 0, 0, eq ccmp has_nul2, 0, 0, eq
beq L(main_loop_entry) b.eq L(bytes16_31)
/* Find the exact offset of the first NUL byte in the first 16 bytes
from the string start. Enter with C = has_nul1 == 0. */
csel has_nul1, has_nul1, has_nul2, cc csel has_nul1, has_nul1, has_nul2, cc
mov len, 8 mov len, 8
rev has_nul1, has_nul1 rev has_nul1, has_nul1
clz tmp1, has_nul1
csel len, xzr, len, cc csel len, xzr, len, cc
clz tmp1, has_nul1
add len, len, tmp1, lsr 3 add len, len, tmp1, lsr 3
ret ret
L(main_loop_entry): .p2align 3
bic src, srcin, 15 /* Look for a NUL byte at offset 16..31 in the string. */
sub src, src, 16 L(bytes16_31):
ldp data1, data2, [srcin, 16]
L(main_loop):
ldr dataq, [src, 32]!
L(page_cross_entry):
/* Get the minimum value and keep going if it is not zero. */
uminv datab2, datav.16b
mov tmp1, datav2.d[0]
cbz tmp1, L(tail)
ldr dataq, [src, 16]
uminv datab2, datav.16b
mov tmp1, datav2.d[0]
cbnz tmp1, L(main_loop)
add src, src, 16
L(tail):
#ifdef __AARCH64EB__ #ifdef __AARCH64EB__
rev64 datav.16b, datav.16b
#endif
/* Set the NUL byte as 0xff and the rest as 0x00, move the data into a
pair of scalars and then compute the length from the earliest NULL
byte. */
cmeq datav.16b, datav.16b, #0
mov data1, datav.d[0]
mov data2, datav.d[1]
cmp data1, 0
csel data1, data1, data2, ne
sub len, src, srcin
rev data1, data1 rev data1, data1
add tmp2, len, 8 rev data2, data2
clz tmp1, data1 #endif
csel len, len, tmp2, ne sub tmp1, data1, zeroones
orr tmp2, data1, REP8_7f
sub tmp3, data2, zeroones
orr tmp4, data2, REP8_7f
bics has_nul1, tmp1, tmp2
bic has_nul2, tmp3, tmp4
ccmp has_nul2, 0, 0, eq
b.eq L(loop_entry)
/* Find the exact offset of the first NUL byte at offset 16..31 from
the string start. Enter with C = has_nul1 == 0. */
csel has_nul1, has_nul1, has_nul2, cc
mov len, 24
rev has_nul1, has_nul1
mov tmp3, 16
clz tmp1, has_nul1
csel len, tmp3, len, cc
add len, len, tmp1, lsr 3 add len, len, tmp1, lsr 3
ret ret
/* Load 16 bytes from [srcin & ~15] and force the bytes that precede L(loop_entry):
srcin to 0xff, so we ignore any NUL bytes before the string. bic src, srcin, 31
Then continue in the aligned loop. */
L(page_cross): .p2align 5
mov tmp3, 63 L(loop):
bic src, srcin, 15 ldp dataq1, dataq2, [src, 32]!
and tmp1, srcin, 7 uminp maskv.16b, datav1.16b, datav2.16b
ands tmp2, srcin, 8 uminp maskv.16b, maskv.16b, maskv.16b
ldr dataq, [src] cmeq maskv.8b, maskv.8b, 0
lsl tmp1, tmp1, 3 fmov synd, maskd
csel tmp2, tmp2, tmp1, eq cbz synd, L(loop)
csel tmp1, tmp1, tmp3, eq
mov tmp4, -1 /* Low 32 bits of synd are non-zero if a NUL was found in datav1. */
cmeq maskv.16b, datav1.16b, 0
sub len, src, srcin
tst synd, 0xffffffff
b.ne 1f
cmeq maskv.16b, datav2.16b, 0
add len, len, 16
1:
/* Generate a bitmask and compute correct byte offset. */
#ifdef __AARCH64EB__ #ifdef __AARCH64EB__
/* Big-endian. Early bytes are at MSB. */ bic maskv.8h, 0xf0
lsr tmp1, tmp4, tmp1
lsr tmp2, tmp4, tmp2
#else #else
/* Little-endian. Early bytes are at LSB. */ bic maskv.8h, 0x0f, lsl 8
lsl tmp1, tmp4, tmp1
lsl tmp2, tmp4, tmp2
#endif #endif
mov datav2.d[0], tmp1 umaxp maskv.16b, maskv.16b, maskv.16b
mov datav2.d[1], tmp2 fmov synd, maskd
orn datav.16b, datav.16b, datav2.16b #ifndef __AARCH64EB__
b L(page_cross_entry) rbit synd, synd
#endif
clz tmp, synd
add len, len, tmp, lsr 2
ret
.p2align 4
L(page_cross):
bic src, srcin, 31
mov tmpw, 0x0c03
movk tmpw, 0xc030, lsl 16
ld1 {datav1.16b, datav2.16b}, [src]
dup maskv.4s, tmpw
cmeq datav1.16b, datav1.16b, 0
cmeq datav2.16b, datav2.16b, 0
and datav1.16b, datav1.16b, maskv.16b
and datav2.16b, datav2.16b, maskv.16b
addp maskv.16b, datav1.16b, datav2.16b
addp maskv.16b, maskv.16b, maskv.16b
fmov synd, maskd
lsl shift, srcin, 1
lsr synd, synd, shift
cbz synd, L(loop)
rbit synd, synd
clz len, synd
lsr len, len, 1
ret
END (__strlen_asimd) END (__strlen_asimd)
weak_alias (__strlen_asimd, strlen_asimd) weak_alias (__strlen_asimd, strlen_asimd)
libc_hidden_builtin_def (strlen_asimd) libc_hidden_builtin_def (strlen_asimd)

View File

@ -17,14 +17,14 @@
<https://www.gnu.org/licenses/>. */ <https://www.gnu.org/licenses/>. */
/* The actual strlen code is in ../strlen.S. If we are building libc this file /* The actual strlen code is in ../strlen.S. If we are building libc this file
defines __strlen_generic. Otherwise the include of ../strlen.S will define defines __strlen_mte. Otherwise the include of ../strlen.S will define
the normal __strlen entry points. */ the normal __strlen entry points. */
#include <sysdep.h> #include <sysdep.h>
#if IS_IN (libc) #if IS_IN (libc)
# define STRLEN __strlen_generic # define STRLEN __strlen_mte
/* Do not hide the generic version of strlen, we use it internally. */ /* Do not hide the generic version of strlen, we use it internally. */
# undef libc_hidden_builtin_def # undef libc_hidden_builtin_def
@ -32,7 +32,7 @@
# ifdef SHARED # ifdef SHARED
/* It doesn't make sense to send libc-internal strlen calls through a PLT. */ /* It doesn't make sense to send libc-internal strlen calls through a PLT. */
.globl __GI_strlen; __GI_strlen = __strlen_generic .globl __GI_strlen; __GI_strlen = __strlen_mte
# endif # endif
#endif #endif