AArch64: Improve codegen of AdvSIMD logf function family

Load the polynomial evaluation coefficients into 2 vectors and use lanewise MLAs.
8% improvement in throughput microbenchmark on Neoverse V1 for log2 and log,
and 2% for log10.

Reviewed-by: Wilco Dijkstra  <Wilco.Dijkstra@arm.com>
Author: Joana Cruz 2024-12-17 14:47:31 +00:00, committed by Wilco Dijkstra
parent f9493a15ea
commit d6e034f5b2
3 changed files with 66 additions and 40 deletions
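For readers less familiar with the pattern the commit message describes, the sketch below shows the lane-indexed FMA scheme in isolation: the even-index coefficients are broadcast vectors, the odd-index ones are packed into a single float32x4_t and selected by lane with vfmaq_laneq_f32, and the paired terms are then combined by Horner's rule in r^2. This is a minimal standalone example with placeholder coefficient values and an illustrative helper name (eval_poly7), not the glibc code itself.

#include <arm_neon.h>

/* Sketch of the lane-indexed pairwise-Horner pattern (placeholder
   coefficients, not the glibc values).  Evaluates
   c0 + c1*r + c2*r^2 + ... + c7*r^7 in every lane.  */
static inline float32x4_t
eval_poly7 (float32x4_t r)
{
  /* Even-index coefficients as broadcast vectors (FMA accumulators).  */
  const float32x4_t c0 = vdupq_n_f32 (0.125f);
  const float32x4_t c2 = vdupq_n_f32 (-0.25f);
  const float32x4_t c4 = vdupq_n_f32 (0.5f);
  const float32x4_t c6 = vdupq_n_f32 (-1.0f);
  /* Odd-index coefficients packed into one vector: lane i holds c(2i+1).  */
  const float c_odd[4] = { 0.1f, -0.2f, 0.3f, -0.4f };
  float32x4_t c1357 = vld1q_f32 (c_odd);

  float32x4_t r2 = vmulq_f32 (r, r);
  /* One lanewise FMA per coefficient pair: cNM = cN + r * cM.  */
  float32x4_t c01 = vfmaq_laneq_f32 (c0, r, c1357, 0);
  float32x4_t c23 = vfmaq_laneq_f32 (c2, r, c1357, 1);
  float32x4_t c45 = vfmaq_laneq_f32 (c4, r, c1357, 2);
  float32x4_t c67 = vfmaq_laneq_f32 (c6, r, c1357, 3);
  /* Horner in r2 over the pairs:
     poly = c01 + r2 * (c23 + r2 * (c45 + r2 * c67)).  */
  float32x4_t p47 = vfmaq_f32 (c45, r2, c67);
  float32x4_t p27 = vfmaq_f32 (c23, r2, p47);
  return vfmaq_f32 (c01, r2, p27);
}

Compared with keeping every coefficient in its own vector, this brings the four odd coefficients in with one vld1q_f32 and lets the lane-indexed FMLAs reuse that single register, which is the codegen improvement the commit message reports.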


@@ -18,21 +18,25 @@
    <https://www.gnu.org/licenses/>.  */
 
 #include "v_math.h"
-#include "poly_advsimd_f32.h"
 
 static const struct data
 {
+  float32x4_t c0, c2, c4, c6, inv_ln10, ln2;
   uint32x4_t off, offset_lower_bound;
   uint16x8_t special_bound;
   uint32x4_t mantissa_mask;
-  float32x4_t poly[8];
-  float32x4_t inv_ln10, ln2;
+  float c1, c3, c5, c7;
 } data = {
   /* Use order 9 for log10(1+x), i.e. order 8 for log10(1+x)/x, with x in
      [-1/3, 1/3] (offset=2/3). Max. relative error: 0x1.068ee468p-25.  */
-  .poly = { V4 (-0x1.bcb79cp-3f), V4 (0x1.2879c8p-3f), V4 (-0x1.bcd472p-4f),
-            V4 (0x1.6408f8p-4f), V4 (-0x1.246f8p-4f), V4 (0x1.f0e514p-5f),
-            V4 (-0x1.0fc92cp-4f), V4 (0x1.f5f76ap-5f) },
+  .c0 = V4 (-0x1.bcb79cp-3f),
+  .c1 = 0x1.2879c8p-3f,
+  .c2 = V4 (-0x1.bcd472p-4f),
+  .c3 = 0x1.6408f8p-4f,
+  .c4 = V4 (-0x1.246f8p-4f),
+  .c5 = 0x1.f0e514p-5f,
+  .c6 = V4 (-0x1.0fc92cp-4f),
+  .c7 = 0x1.f5f76ap-5f,
   .ln2 = V4 (0x1.62e43p-1f),
   .inv_ln10 = V4 (0x1.bcb7b2p-2f),
   /* Lower bound is the smallest positive normal float 0x00800000.  For
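Note on the new data layout: the even-index coefficients stay as full float32x4_t vectors (they are the accumulator operands of the FMAs), while the odd-index coefficients become plain floats stored consecutively, so the function body can fetch all four with one vld1q_f32 (&d->c1) and address them by lane, e.g. vfmaq_laneq_f32 (d->c0, r, c1357, 0) computes c0 + r*c1 in every element.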
@@ -62,7 +66,7 @@ special_case (float32x4_t y, uint32x4_t u_off, float32x4_t p, float32x4_t r2,
 float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x)
 {
   const struct data *d = ptr_barrier (&data);
-
+  float32x4_t c1357 = vld1q_f32 (&d->c1);
   /* To avoid having to mov x out of the way, keep u after offset has been
      applied, and recover x by adding the offset back in the special-case
      handler.  */
@@ -81,7 +85,16 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x)
 
   /* y = log10(1+r) + n * log10(2).  */
   float32x4_t r2 = vmulq_f32 (r, r);
-  float32x4_t poly = v_pw_horner_7_f32 (r, r2, d->poly);
+
+  float32x4_t c01 = vfmaq_laneq_f32 (d->c0, r, c1357, 0);
+  float32x4_t c23 = vfmaq_laneq_f32 (d->c2, r, c1357, 1);
+  float32x4_t c45 = vfmaq_laneq_f32 (d->c4, r, c1357, 2);
+  float32x4_t c67 = vfmaq_laneq_f32 (d->c6, r, c1357, 3);
+
+  float32x4_t p47 = vfmaq_f32 (c45, r2, c67);
+  float32x4_t p27 = vfmaq_f32 (c23, r2, p47);
+  float32x4_t poly = vfmaq_f32 (c01, r2, p27);
+
   /* y = Log10(2) * n + poly * InvLn(10).  */
   float32x4_t y = vfmaq_f32 (r, d->ln2, n);
   y = vmulq_f32 (y, d->inv_ln10);
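For reference, the new sequence evaluates the same order-7 polynomial in r that v_pw_horner_7_f32 did, just regrouped in coefficient pairs: with cNM = cN + r*cM, it forms poly = c01 + r2*(c23 + r2*(c45 + r2*c67)), which expands back to c0 + c1*r + c2*r^2 + ... + c7*r^7.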


@@ -18,22 +18,27 @@
    <https://www.gnu.org/licenses/>.  */
 
 #include "v_math.h"
-#include "poly_advsimd_f32.h"
 
 static const struct data
 {
+  float32x4_t c0, c2, c4, c6, c8;
   uint32x4_t off, offset_lower_bound;
   uint16x8_t special_bound;
   uint32x4_t mantissa_mask;
-  float32x4_t poly[9];
+  float c1, c3, c5, c7;
 } data = {
   /* Coefficients generated using Remez algorithm approximate
      log2(1+r)/r for r in [ -1/3, 1/3 ].
      rel error: 0x1.c4c4b0cp-26.  */
-  .poly = { V4 (0x1.715476p0f), /* (float)(1 / ln(2)).  */
-            V4 (-0x1.715458p-1f), V4 (0x1.ec701cp-2f), V4 (-0x1.7171a4p-2f),
-            V4 (0x1.27a0b8p-2f), V4 (-0x1.e5143ep-3f), V4 (0x1.9d8ecap-3f),
-            V4 (-0x1.c675bp-3f), V4 (0x1.9e495p-3f) },
+  .c0 = V4 (0x1.715476p0f), /* (float)(1 / ln(2)).  */
+  .c1 = -0x1.715458p-1f,
+  .c2 = V4 (0x1.ec701cp-2f),
+  .c3 = -0x1.7171a4p-2f,
+  .c4 = V4 (0x1.27a0b8p-2f),
+  .c5 = -0x1.e5143ep-3f,
+  .c6 = V4 (0x1.9d8ecap-3f),
+  .c7 = -0x1.c675bp-3f,
+  .c8 = V4 (0x1.9e495p-3f),
   /* Lower bound is the smallest positive normal float 0x00800000.  For
      optimised register use subnormals are detected after offset has been
      subtracted, so lower bound is 0x0080000 - offset (which wraps around).  */
@@ -79,11 +84,21 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log2) (float32x4_t x)
 
   /* y = log2(1+r) + n.  */
   float32x4_t r2 = vmulq_f32 (r, r);
-  float32x4_t p = v_pw_horner_8_f32 (r, r2, d->poly);
 
+  float32x4_t c1357 = vld1q_f32 (&d->c1);
+  float32x4_t c01 = vfmaq_laneq_f32 (d->c0, r, c1357, 0);
+  float32x4_t c23 = vfmaq_laneq_f32 (d->c2, r, c1357, 1);
+  float32x4_t c45 = vfmaq_laneq_f32 (d->c4, r, c1357, 2);
+  float32x4_t c67 = vfmaq_laneq_f32 (d->c6, r, c1357, 3);
+
+  float32x4_t p68 = vfmaq_f32 (c67, r2, d->c8);
+  float32x4_t p48 = vfmaq_f32 (c45, r2, p68);
+  float32x4_t p28 = vfmaq_f32 (c23, r2, p48);
+  float32x4_t p = vfmaq_f32 (c01, r2, p28);
+
   if (__glibc_unlikely (v_any_u16h (special)))
     return special_case (n, u_off, p, r, special, d);
   return vfmaq_f32 (n, p, r);
 }
 libmvec_hidden_def (V_NAME_F1 (log2))
 HALF_WIDTH_ALIAS_F1 (log2)
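As in log10, this regroups the order-8 polynomial into pairs: p = c01 + r2*(c23 + r2*(c45 + r2*(c67 + r2*c8))), where cNM = cN + r*cM. The ninth coefficient c8 stays a full vector and is folded in first by p68 = c67 + r2*c8, so only the four odd-index coefficients need the extra c1357 load.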


@@ -21,16 +21,19 @@
 
 static const struct data
 {
-  uint32x4_t off, offset_lower_bound;
+  float32x4_t c2, c4, c6, ln2;
+  uint32x4_t off, offset_lower_bound, mantissa_mask;
   uint16x8_t special_bound;
-  uint32x4_t mantissa_mask;
-  float32x4_t poly[7];
-  float32x4_t ln2;
+  float c1, c3, c5, c0;
 } data = {
   /* 3.34 ulp error.  */
-  .poly = { V4 (-0x1.3e737cp-3f), V4 (0x1.5a9aa2p-3f), V4 (-0x1.4f9934p-3f),
-            V4 (0x1.961348p-3f), V4 (-0x1.00187cp-2f), V4 (0x1.555d7cp-2f),
-            V4 (-0x1.ffffc8p-2f) },
+  .c0 = -0x1.3e737cp-3f,
+  .c1 = 0x1.5a9aa2p-3f,
+  .c2 = V4 (-0x1.4f9934p-3f),
+  .c3 = 0x1.961348p-3f,
+  .c4 = V4 (-0x1.00187cp-2f),
+  .c5 = 0x1.555d7cp-2f,
+  .c6 = V4 (-0x1.ffffc8p-2f),
   .ln2 = V4 (0x1.62e43p-1f),
   /* Lower bound is the smallest positive normal float 0x00800000.  For
      optimised register use subnormals are detected after offset has been
@@ -41,8 +44,6 @@ static const struct data
   .mantissa_mask = V4 (0x007fffff)
 };
 
-#define P(i) d->poly[7 - i]
-
 static float32x4_t VPCS_ATTR NOINLINE
 special_case (float32x4_t p, uint32x4_t u_off, float32x4_t y, float32x4_t r2,
               uint16x4_t cmp, const struct data *d)
@@ -55,33 +56,30 @@ special_case (float32x4_t p, uint32x4_t u_off, float32x4_t y, float32x4_t r2,
 float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log) (float32x4_t x)
 {
   const struct data *d = ptr_barrier (&data);
-  float32x4_t n, p, q, r, r2, y;
-  uint32x4_t u, u_off;
-  uint16x4_t cmp;
+  float32x4_t c1350 = vld1q_f32 (&d->c1);
 
   /* To avoid having to mov x out of the way, keep u after offset has been
      applied, and recover x by adding the offset back in the special-case
      handler.  */
-  u_off = vreinterpretq_u32_f32 (x);
+  uint32x4_t u_off = vsubq_u32 (vreinterpretq_u32_f32 (x), d->off);
 
   /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3.  */
-  u_off = vsubq_u32 (u_off, d->off);
-  n = vcvtq_f32_s32 (
+  float32x4_t n = vcvtq_f32_s32 (
       vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend.  */
-  u = vandq_u32 (u_off, d->mantissa_mask);
-  u = vaddq_u32 (u, d->off);
-  r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
+  uint16x4_t cmp = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound),
+                             vget_low_u16 (d->special_bound));
 
-  cmp = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound),
-                  vget_low_u16 (d->special_bound));
+  uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off);
+  float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
 
   /* y = log(1+r) + n*ln2.  */
-  r2 = vmulq_f32 (r, r);
+  float32x4_t r2 = vmulq_f32 (r, r);
+
   /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))).  */
-  p = vfmaq_f32 (P (5), P (6), r);
-  q = vfmaq_f32 (P (3), P (4), r);
-  y = vfmaq_f32 (P (1), P (2), r);
-  p = vfmaq_f32 (p, P (7), r2);
+  float32x4_t p = vfmaq_laneq_f32 (d->c2, r, c1350, 0);
+  float32x4_t q = vfmaq_laneq_f32 (d->c4, r, c1350, 1);
+  float32x4_t y = vfmaq_laneq_f32 (d->c6, r, c1350, 2);
+  p = vfmaq_laneq_f32 (p, r2, c1350, 3);
+
   q = vfmaq_f32 (q, p, r2);
   y = vfmaq_f32 (y, q, r2);
   p = vfmaq_f32 (r, d->ln2, n);
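One subtlety in the logf version: the scalar coefficients are declared in the order c1, c3, c5, c0, so the single vld1q_f32 (&d->c1) places c1, c3, c5 and c0 into lanes 0-3 of c1350. The final lanewise FMA therefore uses lane 3 to fold in c0, the highest-order term: p = p + r2*c0, matching the old p = vfmaq_f32 (p, P (7), r2).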