mirror of git://sourceware.org/git/glibc.git
X86-64: Prepare memmove-vec-unaligned-erms.S
Prepare memmove-vec-unaligned-erms.S to make the SSE2 version the
default memcpy, mempcpy and memmove.

	* sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
	(MEMCPY_SYMBOL): New.
	(MEMPCPY_SYMBOL): Likewise.
	(MEMMOVE_CHK_SYMBOL): Likewise.
	Replace MEMMOVE_SYMBOL with MEMMOVE_CHK_SYMBOL on __mempcpy_chk
	symbols.  Replace MEMMOVE_SYMBOL with MEMPCPY_SYMBOL on
	__mempcpy symbols.  Provide alias for __memcpy_chk in libc.a.
	Provide alias for memcpy in libc.a and ld.so.
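The new MEMCPY_SYMBOL, MEMPCPY_SYMBOL and MEMMOVE_CHK_SYMBOL hooks let the file that includes this template choose the names of the emitted symbols; each of them falls back to MEMMOVE_SYMBOL when left undefined. As a rough sketch of the intended use (the file layout and macro values below are illustrative assumptions, not the actual glibc wrapper), a default SSE2 build could set everything to the generic names before including the template:

    /* Hypothetical SSE2 wrapper, for illustration only.  */
    #define VEC_SIZE    16
    #define VEC(i)      xmm##i
    #define VMOVU       movups
    #define VMOVA       movaps
    #define SECTION(p)  p

    /* Emit the generic symbol names rather than __memmove_sse2_*-style
       multiarch names.  */
    #define MEMMOVE_SYMBOL(p,s)      p
    #define MEMMOVE_CHK_SYMBOL(p,s)  p
    #define MEMPCPY_SYMBOL(p,s)      p
    #define MEMCPY_SYMBOL(p,s)       p

    #include "multiarch/memmove-vec-unaligned-erms.S"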
parent 4af1bb06c5
commit a7d1c51482

ChangeLog | 11
ChangeLog

@@ -1,3 +1,14 @@
+2016-04-06  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+	(MEMCPY_SYMBOL): New.
+	(MEMPCPY_SYMBOL): Likewise.
+	(MEMMOVE_CHK_SYMBOL): Likewise.
+	Replace MEMMOVE_SYMBOL with MEMMOVE_CHK_SYMBOL on __mempcpy_chk
+	symbols.  Replace MEMMOVE_SYMBOL with MEMPCPY_SYMBOL on
+	__mempcpy symbols.  Provide alias for __memcpy_chk in libc.a.
+	Provide alias for memcpy in libc.a and ld.so.
+
 2016-04-06  H.J. Lu  <hongjiu.lu@intel.com>
 
 	* sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S

@@ -32,18 +32,27 @@
       8 * VEC_SIZE at a time.
    8. Otherwise, forward copy 8 * VEC_SIZE at a time.  */
 
-#if IS_IN (libc)
+#include <sysdep.h>
 
-# include <sysdep.h>
-# include "asm-syntax.h"
+#ifndef MEMCPY_SYMBOL
+# define MEMCPY_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
+#endif
 
-# ifndef VZEROUPPER
-#  if VEC_SIZE > 16
-#   define VZEROUPPER vzeroupper
-#  else
-#   define VZEROUPPER
-#  endif
+#ifndef MEMPCPY_SYMBOL
+# define MEMPCPY_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
+#endif
+
+#ifndef MEMMOVE_CHK_SYMBOL
+# define MEMMOVE_CHK_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
+#endif
+
+#ifndef VZEROUPPER
+# if VEC_SIZE > 16
+#  define VZEROUPPER vzeroupper
+# else
+#  define VZEROUPPER
 # endif
+#endif
 
 /* Threshold to use Enhanced REP MOVSB.  Since there is overhead to set
    up REP MOVSB operation, REP MOVSB isn't faster on short data.  The
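Since each of the new macros defaults to MEMMOVE_SYMBOL, existing multiarch variants that only define MEMMOVE_SYMBOL keep emitting the same symbols. For example, assuming a variant defines (illustrative, patterned on the AVX flavour):

    #define MEMMOVE_SYMBOL(p,s)  p##_avx_##s

then with the defaults above:

    MEMPCPY_SYMBOL (__mempcpy, unaligned_2)          /* -> __mempcpy_avx_unaligned_2 */
    MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_2)  /* -> __mempcpy_chk_avx_unaligned_2 */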
@@ -52,32 +61,36 @@
    on processors with Enhanced REP MOVSB.  Since larger register size
    can move more data with a single load and store, the threshold is
    higher with larger register size.  */
-# ifndef REP_MOVSB_THRESHOLD
-#  define REP_MOVSB_THRESHOLD	(2048 * (VEC_SIZE / 16))
-# endif
+#ifndef REP_MOVSB_THRESHOLD
+# define REP_MOVSB_THRESHOLD	(2048 * (VEC_SIZE / 16))
+#endif
 
-# ifndef SECTION
-#  error SECTION is not defined!
-# endif
+#ifndef SECTION
+# error SECTION is not defined!
+#endif
 
 	.section SECTION(.text),"ax",@progbits
-# ifdef SHARED
-ENTRY (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_2))
+#if defined SHARED && IS_IN (libc)
+ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_2))
 	cmpq	%rdx, %rcx
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_2))
+END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_2))
+#endif
 
-ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_2))
+#if VEC_SIZE == 16 || defined SHARED
+ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned_2))
 	movq	%rdi, %rax
 	addq	%rdx, %rax
 	jmp	L(start)
-END (MEMMOVE_SYMBOL (__mempcpy, unaligned_2))
+END (MEMPCPY_SYMBOL (__mempcpy, unaligned_2))
+#endif
 
-ENTRY (MEMMOVE_SYMBOL (__memmove_chk, unaligned_2))
+#if defined SHARED && IS_IN (libc)
+ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_2))
 	cmpq	%rdx, %rcx
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMMOVE_SYMBOL (__memmove_chk, unaligned_2))
-# endif
+END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_2))
+#endif
 
 ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_2))
 	movq	%rdi, %rax
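Two details in this hunk are worth spelling out. REP_MOVSB_THRESHOLD scales with the vector width: 2048 * (VEC_SIZE / 16) works out to 2048 bytes for VEC_SIZE == 16, 4096 for 32 and 8192 for 64. And the __mempcpy_chk/__memmove_chk entries implement the fortify check: %rcx holds the destination buffer size (the fourth argument) and %rdx the copy length, so the cmpq/jb pair diverts to __chk_fail whenever the buffer is smaller than the requested copy.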
@@ -86,24 +99,29 @@ L(start):
 	jb	L(less_vec)
 	cmpq	$(VEC_SIZE * 2), %rdx
 	ja	L(more_2x_vec)
+#if !defined USE_MULTIARCH || !IS_IN (libc)
+L(last_2x_vec):
+#endif
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
 	VMOVU	(%rsi), %VEC(0)
 	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
 	VMOVU	%VEC(0), (%rdi)
 	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
 	VZEROUPPER
+#if !defined USE_MULTIARCH || !IS_IN (libc)
+L(nop):
+#endif
 	ret
+#if defined USE_MULTIARCH && IS_IN (libc)
 END (MEMMOVE_SYMBOL (__memmove, unaligned_2))
 
-# if VEC_SIZE == 16
+# if VEC_SIZE == 16 && defined SHARED
 /* Only used to measure performance of REP MOVSB.  */
-#  ifdef SHARED
 ENTRY (__mempcpy_erms)
 	movq	%rdi, %rax
 	addq	%rdx, %rax
 	jmp	L(start_movsb)
 END (__mempcpy_erms)
-#  endif
 
 ENTRY (__memmove_erms)
 	movq	%rdi, %rax
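As the comment notes, __mempcpy_erms and __memmove_erms are kept only as reference points for benchmarking REP MOVSB. Leaving aside the return value and the backward-overlap handling, the operation they time is just the string-move instruction; a minimal sketch (not the full __memmove_erms body):

	movq	%rdi, %rax	/* memcpy-style return value: the destination.  */
	movq	%rdx, %rcx	/* rep movsb takes its byte count in %rcx.  */
	rep movsb
	ret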
@@ -132,11 +150,10 @@ strong_alias (__memmove_erms, __memcpy_erms)
 # endif
 
 # ifdef SHARED
-ENTRY (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_erms))
+ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
 	cmpq	%rdx, %rcx
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_erms))
-# endif
+END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
 
 ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
 	movq	%rdi, %rax
@@ -144,11 +161,10 @@ ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
 	jmp	L(start_erms)
 END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
 
-# ifdef SHARED
-ENTRY (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms))
+ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
 	cmpq	%rdx, %rcx
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms))
+END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
 # endif
 
 ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
@@ -192,6 +208,7 @@ L(movsb_more_2x_vec):
 	/* Force 32-bit displacement to avoid long nop between
 	   instructions.  */
 	ja.d32	L(movsb)
+#endif
 	.p2align 4
 L(more_2x_vec):
 	/* More than 2 * VEC.  */
@@ -227,13 +244,19 @@ L(copy_forward):
 	VMOVU	%VEC(2), -(VEC_SIZE * 3)(%rdi,%rdx)
 	VMOVU	%VEC(3), -(VEC_SIZE * 4)(%rdi,%rdx)
 	cmpq	$(VEC_SIZE * 8), %rdx
-# if VEC_SIZE == 16
+#if VEC_SIZE == 16
+# if defined USE_MULTIARCH && IS_IN (libc)
 	jbe	L(return)
 # else
+	/* Use 32-bit displacement to avoid long nop between
+	   instructions.  */
+	jbe.d32	L(return)
+# endif
+#else
 	/* Use 8-bit displacement to avoid long nop between
 	   instructions.  */
 	jbe	L(return_disp8)
-# endif
+#endif
 	leaq	(VEC_SIZE * 4)(%rdi), %rcx
 	addq	%rdi, %rdx
 	andq	$-(VEC_SIZE * 4), %rdx
@@ -263,22 +286,25 @@ L(loop):
 	addq	$(VEC_SIZE * 4), %rcx
 	cmpq	%rcx, %rdx
 	jne	L(loop)
+#if !defined USE_MULTIARCH || !IS_IN (libc)
+L(return):
+#endif
 L(return_disp8):
 	VZEROUPPER
 	ret
 L(less_vec):
 	/* Less than 1 VEC.  */
-# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
-#  error Unsupported VEC_SIZE!
-# endif
-# if VEC_SIZE > 32
+#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
+# error Unsupported VEC_SIZE!
+#endif
+#if VEC_SIZE > 32
 	cmpb	$32, %dl
 	jae	L(between_32_63)
-# endif
-# if VEC_SIZE > 16
+#endif
+#if VEC_SIZE > 16
 	cmpb	$16, %dl
 	jae	L(between_16_31)
-# endif
+#endif
 	cmpb	$8, %dl
 	jae	L(between_8_15)
 	cmpb	$4, %dl
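Note that the L(less_vec) path can dispatch on %dl alone: it is reached only when the length in %rdx is already known to be below VEC_SIZE (at most 64), so the low byte carries the whole value.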
@@ -290,7 +316,7 @@ L(less_vec):
 	movb	%cl, (%rdi)
 1:
 	ret
-# if VEC_SIZE > 32
+#if VEC_SIZE > 32
 L(between_32_63):
 	/* From 32 to 63.  No branch when size == 32.  */
 	vmovdqu	(%rsi), %ymm0
@@ -299,8 +325,8 @@ L(between_32_63):
 	vmovdqu	%ymm1, -32(%rdi,%rdx)
 	VZEROUPPER
 	ret
-# endif
-# if VEC_SIZE > 16
+#endif
+#if VEC_SIZE > 16
 	/* From 16 to 31.  No branch when size == 16.  */
 L(between_16_31):
 	vmovdqu	(%rsi), %xmm0
@@ -308,7 +334,7 @@ L(between_16_31):
 	vmovdqu	%xmm0, (%rdi)
 	vmovdqu	%xmm1, -16(%rdi,%rdx)
 	ret
-# endif
+#endif
 L(between_8_15):
 	/* From 8 to 15.  No branch when size == 8.  */
 	movq	-8(%rsi,%rdx), %rcx
@@ -331,10 +357,10 @@ L(between_2_3):
 	movw	%si, (%rdi)
 	ret
 
-# if VEC_SIZE > 16
+#if VEC_SIZE > 16
 	/* Align to 16 bytes to avoid long nop between instructions.  */
 	.p2align 4
-# endif
+#endif
 L(more_2x_vec_overlap):
 	/* More than 2 * VEC and there is overlap bewteen destination
 	   and source.  */
@@ -454,15 +480,19 @@ L(loop_8x_vec_backward):
 	jmp	L(between_4x_vec_and_8x_vec)
 END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
 
-# ifdef SHARED
+#ifdef SHARED
+# if IS_IN (libc)
+#  ifdef USE_MULTIARCH
 strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
 	      MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
 strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
 	      MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
-strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_2),
-	      MEMMOVE_SYMBOL (__memcpy, unaligned_2))
-strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_2),
-	      MEMMOVE_SYMBOL (__memcpy_chk, unaligned_2))
+#  endif
+strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_2),
+	      MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned_2))
 # endif
+#endif
+
+#if VEC_SIZE == 16 || defined SHARED
+strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_2),
+	      MEMCPY_SYMBOL (__memcpy, unaligned_2))
 #endif
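For a build that defines the symbol macros as plain identities (the assumption used in the sketch near the top of this page, not something shown in this patch), the new alias block reduces to an ordinary strong alias, e.g.:

    strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_2),
                  MEMCPY_SYMBOL (__memcpy, unaligned_2))
    /* expands to */
    strong_alias (__memmove, __memcpy)

which is what the commit message means by providing the memcpy and __memcpy_chk aliases for libc.a and ld.so (the exact spelling depends on how the including file defines MEMCPY_SYMBOL and MEMMOVE_CHK_SYMBOL).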