x86: Use "%v" to emit VEX encoded instructions for AVX targets

Legacy encodings of SSE instructions incur AVX-SSE domain transition
penalties on some Intel microarchitectures (e.g. Haswell, Broadwell).
Using the VEX forms avoids these penalties and keeps all instructions
in the VEX decode domain.  Use the "%v" sequence to emit the "v" prefix
for opcodes when compiling with -mavx.

No functional changes intended.

Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Reviewed-by: Florian Weimer <fweimer@redhat.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
This commit is contained in:
Uros Bizjak 2025-09-21 20:33:09 +02:00
parent 3014dec3ad
commit ff8be6152b
33 changed files with 71 additions and 85 deletions

View File

@@ -44,13 +44,13 @@ __feclearexcept (int excepts)
unsigned int xnew_exc;
/* Get the current MXCSR. */
__asm__ ("stmxcsr %0" : "=m" (xnew_exc));
__asm__ ("%vstmxcsr %0" : "=m" (xnew_exc));
/* Clear the relevant bits. */
xnew_exc &= ~excepts;
/* Put the new data in effect. */
__asm__ ("ldmxcsr %0" : : "m" (xnew_exc));
__asm__ ("%vldmxcsr %0" : : "m" (xnew_exc));
}
/* Success. */

View File

@@ -41,11 +41,11 @@ fedisableexcept (int excepts)
unsigned int xnew_exc;
/* Get the current control word. */
__asm__ ("stmxcsr %0" : "=m" (xnew_exc));
__asm__ ("%vstmxcsr %0" : "=m" (xnew_exc));
xnew_exc |= excepts << 7;
__asm__ ("ldmxcsr %0" : : "m" (xnew_exc));
__asm__ ("%vldmxcsr %0" : : "m" (xnew_exc));
}
return old_exc;

View File

@@ -41,11 +41,11 @@ feenableexcept (int excepts)
unsigned int xnew_exc;
/* Get the current control word. */
__asm__ ("stmxcsr %0" : "=m" (xnew_exc));
__asm__ ("%vstmxcsr %0" : "=m" (xnew_exc));
xnew_exc &= ~(excepts << 7);
__asm__ ("ldmxcsr %0" : : "m" (xnew_exc));
__asm__ ("%vldmxcsr %0" : : "m" (xnew_exc));
}
return old_exc;

View File

@@ -30,7 +30,7 @@ __fegetenv (fenv_t *envp)
__asm__ ("fldenv %0" : : "m" (*envp));
if (CPU_FEATURE_USABLE (SSE))
__asm__ ("stmxcsr %0" : "=m" (envp->__eip));
__asm__ ("%vstmxcsr %0" : "=m" (envp->__eip));
/* Success. */
return 0;

View File

@@ -26,6 +26,6 @@ fegetmode (femode_t *modep)
{
_FPU_GETCW (modep->__control_word);
if (CPU_FEATURE_USABLE (SSE))
__asm__ ("stmxcsr %0" : "=m" (modep->__mxcsr));
__asm__ ("%vstmxcsr %0" : "=m" (modep->__mxcsr));
return 0;
}

View File

@@ -33,12 +33,12 @@ __feholdexcept (fenv_t *envp)
unsigned int xwork;
/* Get the current control word. */
__asm__ ("stmxcsr %0" : "=m" (envp->__eip));
__asm__ ("%vstmxcsr %0" : "=m" (envp->__eip));
/* Set all exceptions to non-stop and clear them. */
xwork = (envp->__eip | 0x1f80) & ~0x3f;
__asm__ ("ldmxcsr %0" : : "m" (xwork));
__asm__ ("%vldmxcsr %0" : : "m" (xwork));
}
return 0;

View File

@@ -80,7 +80,7 @@ __fesetenv (const fenv_t *envp)
if (CPU_FEATURE_USABLE (SSE))
{
unsigned int mxcsr;
__asm__ ("stmxcsr %0" : "=m" (mxcsr));
__asm__ ("%vstmxcsr %0" : "=m" (mxcsr));
if (envp == FE_DFL_ENV)
{
@@ -111,7 +111,7 @@ __fesetenv (const fenv_t *envp)
else
mxcsr = envp->__eip;
__asm__ ("ldmxcsr %0" : : "m" (mxcsr));
__asm__ ("%vldmxcsr %0" : : "m" (mxcsr));
}
/* Success. */

View File

@@ -33,13 +33,13 @@ fesetexcept (int excepts)
{
/* Get the control word of the SSE unit. */
unsigned int mxcsr;
__asm__ ("stmxcsr %0" : "=m" (mxcsr));
__asm__ ("%vstmxcsr %0" : "=m" (mxcsr));
/* Set relevant flags. */
mxcsr |= excepts;
/* Put the new data in effect. */
__asm__ ("ldmxcsr %0" : : "m" (mxcsr));
__asm__ ("%vldmxcsr %0" : : "m" (mxcsr));
}
else
{

View File

@@ -37,7 +37,7 @@ fesetmode (const femode_t *modep)
if (CPU_FEATURE_USABLE (SSE))
{
unsigned int mxcsr;
__asm__ ("stmxcsr %0" : "=m" (mxcsr));
__asm__ ("%vstmxcsr %0" : "=m" (mxcsr));
/* Preserve SSE exception flags but restore other state in
MXCSR. */
mxcsr &= FE_ALL_EXCEPT_X86;
@@ -47,7 +47,7 @@ fesetmode (const femode_t *modep)
mxcsr |= FE_ALL_EXCEPT_X86 << 7;
else
mxcsr |= modep->__mxcsr & ~FE_ALL_EXCEPT_X86;
__asm__ ("ldmxcsr %0" : : "m" (mxcsr));
__asm__ ("%vldmxcsr %0" : : "m" (mxcsr));
}
return 0;
}

View File

@@ -39,10 +39,10 @@ __fesetround (int round)
{
unsigned int xcw;
__asm__ ("stmxcsr %0" : "=m" (xcw));
__asm__ ("%vstmxcsr %0" : "=m" (xcw));
xcw &= ~0x6000;
xcw |= round << 3;
__asm__ ("ldmxcsr %0" : : "m" (xcw));
__asm__ ("%vldmxcsr %0" : : "m" (xcw));
}
return 0;

View File

@@ -31,7 +31,7 @@ __feupdateenv (const fenv_t *envp)
/* If the CPU supports SSE we test the MXCSR as well. */
if (CPU_FEATURE_USABLE (SSE))
__asm__ ("stmxcsr %0" : "=m" (xtemp));
__asm__ ("%vstmxcsr %0" : "=m" (xtemp));
temp = (temp | xtemp) & FE_ALL_EXCEPT;

View File

@@ -37,7 +37,7 @@ __fegetexceptflag (fexcept_t *flagp, int excepts)
unsigned int sse_exc;
/* Get the current MXCSR. */
__asm__ ("stmxcsr %0" : "=m" (sse_exc));
__asm__ ("%vstmxcsr %0" : "=m" (sse_exc));
*flagp |= sse_exc & excepts & FE_ALL_EXCEPT;
}

View File

@@ -50,13 +50,13 @@ __fesetexceptflag (const fexcept_t *flagp, int excepts)
__asm__ ("fldenv %0" : : "m" (temp));
/* And now similarly for SSE. */
__asm__ ("stmxcsr %0" : "=m" (mxcsr));
__asm__ ("%vstmxcsr %0" : "=m" (mxcsr));
/* Clear or set relevant flags. */
mxcsr ^= (mxcsr ^ *flagp) & excepts;
/* Put the new data in effect. */
__asm__ ("ldmxcsr %0" : : "m" (mxcsr));
__asm__ ("%vldmxcsr %0" : : "m" (mxcsr));
}
else
{

View File

@@ -31,7 +31,7 @@ __fetestexcept (int excepts)
/* If the CPU supports SSE we test the MXCSR as well. */
if (CPU_FEATURE_USABLE (SSE))
__asm__ ("stmxcsr %0" : "=m" (xtemp));
__asm__ ("%vstmxcsr %0" : "=m" (xtemp));
return (temp | xtemp) & excepts & FE_ALL_EXCEPT;
}

View File

@@ -43,11 +43,11 @@ __setfpucw (fpu_control_t set)
unsigned int xnew_exc;
/* Get the current MXCSR. */
__asm__ ("stmxcsr %0" : "=m" (xnew_exc));
__asm__ ("%vstmxcsr %0" : "=m" (xnew_exc));
xnew_exc &= ~((0xc00 << 3) | (FE_ALL_EXCEPT << 7));
xnew_exc |= ((set & 0xc00) << 3) | ((set & FE_ALL_EXCEPT) << 7);
__asm__ ("ldmxcsr %0" : : "m" (xnew_exc));
__asm__ ("%vldmxcsr %0" : : "m" (xnew_exc));
}
}

View File

@@ -18,22 +18,14 @@
need not care for both the 387 and the sse unit, only the one we're
actually using. */
#if defined __AVX__ || defined SSE2AVX
# define STMXCSR "vstmxcsr"
# define LDMXCSR "vldmxcsr"
#else
# define STMXCSR "stmxcsr"
# define LDMXCSR "ldmxcsr"
#endif
static __always_inline void
libc_feholdexcept_sse (fenv_t *e)
{
unsigned int mxcsr;
asm (STMXCSR " %0" : "=m" (mxcsr));
asm ("%vstmxcsr %0" : "=m" (mxcsr));
e->__mxcsr = mxcsr;
mxcsr = (mxcsr | 0x1f80) & ~0x3f;
asm volatile (LDMXCSR " %0" : : "m" (mxcsr));
asm volatile ("%vldmxcsr %0" : : "m" (mxcsr));
}
static __always_inline void
@@ -51,9 +43,9 @@ static __always_inline void
libc_fesetround_sse (int r)
{
unsigned int mxcsr;
asm (STMXCSR " %0" : "=m" (mxcsr));
asm ("%vstmxcsr %0" : "=m" (mxcsr));
mxcsr = (mxcsr & ~0x6000) | (r << 3);
asm volatile (LDMXCSR " %0" : : "m" (mxcsr));
asm volatile ("%vldmxcsr %0" : : "m" (mxcsr));
}
static __always_inline void
@@ -69,10 +61,10 @@ static __always_inline void
libc_feholdexcept_setround_sse (fenv_t *e, int r)
{
unsigned int mxcsr;
asm (STMXCSR " %0" : "=m" (mxcsr));
asm ("%vstmxcsr %0" : "=m" (mxcsr));
e->__mxcsr = mxcsr;
mxcsr = ((mxcsr | 0x1f80) & ~0x603f) | (r << 3);
asm volatile (LDMXCSR " %0" : : "m" (mxcsr));
asm volatile ("%vldmxcsr %0" : : "m" (mxcsr));
}
/* Set both rounding mode and precision. A convenience function for use
@@ -104,7 +96,7 @@ static __always_inline int
libc_fetestexcept_sse (int e)
{
unsigned int mxcsr;
asm volatile (STMXCSR " %0" : "=m" (mxcsr));
asm volatile ("%vstmxcsr %0" : "=m" (mxcsr));
return mxcsr & e & FE_ALL_EXCEPT;
}
@@ -119,7 +111,7 @@ libc_fetestexcept_387 (int ex)
static __always_inline void
libc_fesetenv_sse (fenv_t *e)
{
asm volatile (LDMXCSR " %0" : : "m" (e->__mxcsr));
asm volatile ("%vldmxcsr %0" : : "m" (e->__mxcsr));
}
static __always_inline void
@@ -137,13 +129,13 @@ static __always_inline int
libc_feupdateenv_test_sse (fenv_t *e, int ex)
{
unsigned int mxcsr, old_mxcsr, cur_ex;
asm volatile (STMXCSR " %0" : "=m" (mxcsr));
asm volatile ("%vstmxcsr %0" : "=m" (mxcsr));
cur_ex = mxcsr & FE_ALL_EXCEPT;
/* Merge current exceptions with the old environment. */
old_mxcsr = e->__mxcsr;
mxcsr = old_mxcsr | cur_ex;
asm volatile (LDMXCSR " %0" : : "m" (mxcsr));
asm volatile ("%vldmxcsr %0" : : "m" (mxcsr));
/* Raise SIGFPE for any new exceptions since the hold. Expect that
the normal environment has all exceptions masked. */
@@ -189,10 +181,10 @@ static __always_inline void
libc_feholdsetround_sse (fenv_t *e, int r)
{
unsigned int mxcsr;
asm (STMXCSR " %0" : "=m" (mxcsr));
asm ("%vstmxcsr %0" : "=m" (mxcsr));
e->__mxcsr = mxcsr;
mxcsr = (mxcsr & ~0x6000) | (r << 3);
asm volatile (LDMXCSR " %0" : : "m" (mxcsr));
asm volatile ("%vldmxcsr %0" : : "m" (mxcsr));
}
static __always_inline void
@@ -223,9 +215,9 @@ static __always_inline void
libc_feresetround_sse (fenv_t *e)
{
unsigned int mxcsr;
asm (STMXCSR " %0" : "=m" (mxcsr));
asm ("%vstmxcsr %0" : "=m" (mxcsr));
mxcsr = (mxcsr & ~0x6000) | (e->__mxcsr & 0x6000);
asm volatile (LDMXCSR " %0" : : "m" (mxcsr));
asm volatile ("%vldmxcsr %0" : : "m" (mxcsr));
}
static __always_inline void
@@ -315,13 +307,13 @@ static __always_inline void
libc_feholdexcept_setround_sse_ctx (struct rm_ctx *ctx, int r)
{
unsigned int mxcsr, new_mxcsr;
asm (STMXCSR " %0" : "=m" (mxcsr));
asm ("%vstmxcsr %0" : "=m" (mxcsr));
new_mxcsr = ((mxcsr | 0x1f80) & ~0x603f) | (r << 3);
ctx->env.__mxcsr = mxcsr;
if (__glibc_unlikely (mxcsr != new_mxcsr))
{
asm volatile (LDMXCSR " %0" : : "m" (new_mxcsr));
asm volatile ("%vldmxcsr %0" : : "m" (new_mxcsr));
ctx->updated_status = true;
}
else
@@ -412,13 +404,13 @@ libc_feholdsetround_sse_ctx (struct rm_ctx *ctx, int r)
{
unsigned int mxcsr, new_mxcsr;
asm (STMXCSR " %0" : "=m" (mxcsr));
asm ("%vstmxcsr %0" : "=m" (mxcsr));
new_mxcsr = (mxcsr & ~0x6000) | (r << 3);
ctx->env.__mxcsr = mxcsr;
if (__glibc_unlikely (new_mxcsr != mxcsr))
{
asm volatile (LDMXCSR " %0" : : "m" (new_mxcsr));
asm volatile ("%vldmxcsr %0" : : "m" (new_mxcsr));
ctx->updated_status = true;
}
else

View File

@@ -39,15 +39,9 @@ typedef unsigned int UTItype __attribute__ ((mode (TI)));
# define FP_RND_MASK 0x6000
# ifdef __AVX__
# define AVX_INSN_PREFIX "v"
# else
# define AVX_INSN_PREFIX ""
# endif
# define FP_INIT_ROUNDMODE \
do { \
__asm__ __volatile__ (AVX_INSN_PREFIX "stmxcsr\t%0" : "=m" (_fcw)); \
__asm__ __volatile__ ("%vstmxcsr\t%0" : "=m" (_fcw)); \
} while (0)
#else
# define _FP_W_TYPE_SIZE 32

View File

@@ -29,14 +29,14 @@ static uint32_t
get_sse_mxcsr (void)
{
uint32_t temp;
__asm__ __volatile__ ("stmxcsr %0" : "=m" (temp));
__asm__ __volatile__ ("%vstmxcsr %0" : "=m" (temp));
return temp;
}
static void
set_sse_mxcsr (uint32_t val)
{
__asm__ __volatile__ ("ldmxcsr %0" : : "m" (val));
__asm__ __volatile__ ("%vldmxcsr %0" : : "m" (val));
}
static void

View File

@@ -38,13 +38,13 @@ __feclearexcept (int excepts)
__asm__ ("fldenv %0" : : "m" (temp));
/* And the same procedure for SSE. */
__asm__ ("stmxcsr %0" : "=m" (mxcsr));
__asm__ ("%vstmxcsr %0" : "=m" (mxcsr));
/* Clear the relevant bits. */
mxcsr &= ~excepts;
/* And put them into effect. */
__asm__ ("ldmxcsr %0" : : "m" (mxcsr));
__asm__ ("%vldmxcsr %0" : : "m" (mxcsr));
/* Success. */
return 0;

View File

@@ -35,11 +35,11 @@ fedisableexcept (int excepts)
__asm__ ("fldcw %0" : : "m" (new_exc));
/* And now the same for the SSE MXCSR register. */
__asm__ ("stmxcsr %0" : "=m" (new));
__asm__ ("%vstmxcsr %0" : "=m" (new));
/* The SSE exception masks are shifted by 7 bits. */
new |= excepts << 7;
__asm__ ("ldmxcsr %0" : : "m" (new));
__asm__ ("%vldmxcsr %0" : : "m" (new));
return old_exc;
}

View File

@@ -35,11 +35,11 @@ feenableexcept (int excepts)
__asm__ ("fldcw %0" : : "m" (new_exc));
/* And now the same for the SSE MXCSR register. */
__asm__ ("stmxcsr %0" : "=m" (new));
__asm__ ("%vstmxcsr %0" : "=m" (new));
/* The SSE exception masks are shifted by 7 bits. */
new &= ~(excepts << 7);
__asm__ ("ldmxcsr %0" : : "m" (new));
__asm__ ("%vldmxcsr %0" : : "m" (new));
return old_exc;
}

View File

@@ -25,7 +25,7 @@ __fegetenv (fenv_t *envp)
/* fnstenv changes the exception mask, so load back the
stored environment. */
"fldenv %0\n"
"stmxcsr %1" : "=m" (*envp), "=m" (envp->__mxcsr));
"%vstmxcsr %1" : "=m" (*envp), "=m" (envp->__mxcsr));
/* Success. */
return 0;

View File

@@ -23,6 +23,6 @@ int
fegetmode (femode_t *modep)
{
_FPU_GETCW (modep->__control_word);
__asm__ ("stmxcsr %0" : "=m" (modep->__mxcsr));
__asm__ ("%vstmxcsr %0" : "=m" (modep->__mxcsr));
return 0;
}

View File

@@ -26,13 +26,13 @@ __feholdexcept (fenv_t *envp)
/* Store the environment. Recall that fnstenv has a side effect of
masking all exceptions. Then clear all exceptions. */
__asm__ ("fnstenv %0\n\t"
"stmxcsr %1\n\t"
"%vstmxcsr %1\n\t"
"fnclex"
: "=m" (*envp), "=m" (envp->__mxcsr));
/* Set the SSE MXCSR register. */
mxcsr = (envp->__mxcsr | 0x1f80) & ~0x3f;
__asm__ ("ldmxcsr %0" : : "m" (mxcsr));
__asm__ ("%vldmxcsr %0" : : "m" (mxcsr));
return 0;
}

View File

@@ -36,7 +36,7 @@ __fesetenv (const fenv_t *envp)
Therefore, we get the current environment and replace the values
we want to use from the environment specified by the parameter. */
__asm__ ("fnstenv %0\n"
"stmxcsr %1" : "=m" (temp), "=m" (temp.__mxcsr));
"%vstmxcsr %1" : "=m" (temp), "=m" (temp.__mxcsr));
if (envp == FE_DFL_ENV)
{
@@ -104,7 +104,7 @@ __fesetenv (const fenv_t *envp)
}
__asm__ ("fldenv %0\n"
"ldmxcsr %1" : : "m" (temp), "m" (temp.__mxcsr));
"%vldmxcsr %1" : : "m" (temp), "m" (temp.__mxcsr));
/* Success. */
return 0;

View File

@@ -23,9 +23,9 @@ fesetexcept (int excepts)
{
unsigned int mxcsr;
__asm__ ("stmxcsr %0" : "=m" (mxcsr));
__asm__ ("%vstmxcsr %0" : "=m" (mxcsr));
mxcsr |= excepts & FE_ALL_EXCEPT;
__asm__ ("ldmxcsr %0" : : "m" (mxcsr));
__asm__ ("%vldmxcsr %0" : : "m" (mxcsr));
return 0;
}

View File

@@ -28,7 +28,7 @@ fesetmode (const femode_t *modep)
{
fpu_control_t cw;
unsigned int mxcsr;
__asm__ ("stmxcsr %0" : "=m" (mxcsr));
__asm__ ("%vstmxcsr %0" : "=m" (mxcsr));
/* Preserve SSE exception flags but restore other state in
MXCSR. */
mxcsr &= FE_ALL_EXCEPT_X86;
@@ -45,6 +45,6 @@ fesetmode (const femode_t *modep)
mxcsr |= modep->__mxcsr & ~FE_ALL_EXCEPT_X86;
}
_FPU_SETCW (cw);
__asm__ ("ldmxcsr %0" : : "m" (mxcsr));
__asm__ ("%vldmxcsr %0" : : "m" (mxcsr));
return 0;
}

View File

@@ -36,10 +36,10 @@ __fesetround (int round)
/* And now the MSCSR register for SSE, the precision is at different bit
positions in the different units, we need to shift it 3 bits. */
asm ("stmxcsr %0" : "=m" (mxcsr));
asm ("%vstmxcsr %0" : "=m" (mxcsr));
mxcsr &= ~ 0x6000;
mxcsr |= round << 3;
asm ("ldmxcsr %0" : : "m" (mxcsr));
asm ("%vldmxcsr %0" : : "m" (mxcsr));
return 0;
}

View File

@@ -25,7 +25,7 @@ __feupdateenv (const fenv_t *envp)
unsigned int xtemp;
/* Save current exceptions. */
__asm__ ("fnstsw %0\n\tstmxcsr %1" : "=m" (temp), "=m" (xtemp));
__asm__ ("fnstsw %0\n\t%vstmxcsr %1" : "=m" (temp), "=m" (xtemp));
temp = (temp | xtemp) & FE_ALL_EXCEPT;
/* Install new environment. */

View File

@@ -26,7 +26,7 @@ fegetexceptflag (fexcept_t *flagp, int excepts)
/* Get the current exceptions for the x87 FPU and SSE unit. */
__asm__ ("fnstsw %0\n"
"stmxcsr %1" : "=m" (temp), "=m" (mxscr));
"%vstmxcsr %1" : "=m" (temp), "=m" (mxscr));
*flagp = (temp | mxscr) & FE_ALL_EXCEPT & excepts;

View File

@@ -33,7 +33,7 @@ __feraiseexcept (int excepts)
/* One example of an invalid operation is 0.0 / 0.0. */
float f = 0.0;
__asm__ __volatile__ ("divss %0, %0 " : "+x" (f));
__asm__ __volatile__ ("%vdivss %0, %0 " : "+x" (f));
(void) &f;
}
@@ -43,7 +43,7 @@ __feraiseexcept (int excepts)
float f = 1.0;
float g = 0.0;
__asm__ __volatile__ ("divss %1, %0" : "+x" (f) : "x" (g));
__asm__ __volatile__ ("%vdivss %1, %0" : "+x" (f) : "x" (g));
(void) &f;
}

View File

@@ -44,13 +44,13 @@ fesetexceptflag (const fexcept_t *flagp, int excepts)
__asm__ ("fldenv %0" : : "m" (temp));
/* And now similarly for SSE. */
__asm__ ("stmxcsr %0" : "=m" (mxcsr));
__asm__ ("%vstmxcsr %0" : "=m" (mxcsr));
/* Clear or set relevant flags. */
mxcsr ^= (mxcsr ^ *flagp) & excepts;
/* Put the new data in effect. */
__asm__ ("ldmxcsr %0" : : "m" (mxcsr));
__asm__ ("%vldmxcsr %0" : : "m" (mxcsr));
/* Success. */
return 0;

View File

@@ -26,7 +26,7 @@ __fetestexcept (int excepts)
/* Get current exceptions. */
__asm__ ("fnstsw %0\n"
"stmxcsr %1" : "=m" (temp), "=m" (mxscr));
"%vstmxcsr %1" : "=m" (temp), "=m" (mxscr));
return (temp | mxscr) & excepts & FE_ALL_EXCEPT;
}