AArch64: Implement AdvSIMD and SVE log10p1(f) routines

Vector variants of the new C23 log10p1 routines.

Note: Benchmark inputs for log10p1(f) are identical to log1p(f).

Reviewed-by: Wilco Dijkstra  <Wilco.Dijkstra@arm.com>
This commit is contained in:
Luna Lamb 2025-09-27 10:37:29 +00:00 committed by Wilco Dijkstra
parent db42732474
commit 653e6c4fff
18 changed files with 8765 additions and 1 deletion

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -253,6 +253,17 @@
#define __DECL_SIMD_log10f64x
#define __DECL_SIMD_log10f128x
#define __DECL_SIMD_log10p1
#define __DECL_SIMD_log10p1f
#define __DECL_SIMD_log10p1l
#define __DECL_SIMD_log10p1f16
#define __DECL_SIMD_log10p1f32
#define __DECL_SIMD_log10p1f64
#define __DECL_SIMD_log10p1f128
#define __DECL_SIMD_log10p1f32x
#define __DECL_SIMD_log10p1f64x
#define __DECL_SIMD_log10p1f128x
#define __DECL_SIMD_log2
#define __DECL_SIMD_log2f
#define __DECL_SIMD_log2l

View File

@ -145,7 +145,7 @@ __MATHCALL_VEC (exp10m1,, (_Mdouble_ __x));
__MATHCALL_VEC (log2p1,, (_Mdouble_ __x));
/* Return log10(1 + X). */
__MATHCALL (log10p1,, (_Mdouble_ __x));
__MATHCALL_VEC (log10p1,, (_Mdouble_ __x));
/* Return log(1 + X). */
__MATHCALL_VEC (logp1,, (_Mdouble_ __x));

View File

@ -24,6 +24,7 @@ libmvec-supported-funcs = acos \
hypot \
log \
log10 \
log10p1 \
log1p \
log2 \
log2p1 \

View File

@ -195,5 +195,10 @@ libmvec {
_ZGVnN4v_log2p1f;
_ZGVsMxv_log2p1;
_ZGVsMxv_log2p1f;
_ZGVnN2v_log10p1;
_ZGVnN2v_log10p1f;
_ZGVnN4v_log10p1f;
_ZGVsMxv_log10p1;
_ZGVsMxv_log10p1f;
}
}

View File

@ -40,6 +40,7 @@ libmvec_hidden_proto (V_NAME_F1(exp2m1));
libmvec_hidden_proto (V_NAME_F1(exp10m1));
libmvec_hidden_proto (V_NAME_F2(hypot));
libmvec_hidden_proto (V_NAME_F1(log10));
libmvec_hidden_proto (V_NAME_F1(log10p1));
libmvec_hidden_proto (V_NAME_F1(log1p));
libmvec_hidden_proto (V_NAME_F1(log2));
libmvec_hidden_proto (V_NAME_F1(log2p1));

View File

@ -133,6 +133,10 @@
# define __DECL_SIMD_log10 __DECL_SIMD_aarch64
# undef __DECL_SIMD_log10f
# define __DECL_SIMD_log10f __DECL_SIMD_aarch64
# undef __DECL_SIMD_log10p1
# define __DECL_SIMD_log10p1 __DECL_SIMD_aarch64
# undef __DECL_SIMD_log10p1f
# define __DECL_SIMD_log10p1f __DECL_SIMD_aarch64
# undef __DECL_SIMD_log1p
# define __DECL_SIMD_log1p __DECL_SIMD_aarch64
# undef __DECL_SIMD_log1pf
@ -229,6 +233,7 @@ __vpcs __f32x4_t _ZGVnN4v_exp10m1f (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4vv_hypotf (__f32x4_t, __f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_logf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_log10f (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_log10p1f (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_log1pf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_log2f (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_log2p1f (__f32x4_t);
@ -267,6 +272,7 @@ __vpcs __f64x2_t _ZGVnN2v_exp10m1 (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2vv_hypot (__f64x2_t, __f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_log (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_log10p1 (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_log1p (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_log2 (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_log2p1 (__f64x2_t);
@ -310,6 +316,7 @@ __sv_f32_t _ZGVsMxv_exp10m1f (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxvv_hypotf (__sv_f32_t, __sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_logf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_log10f (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_log10p1f (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_log1pf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_log2f (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_log2p1f (__sv_f32_t, __sv_bool_t);
@ -348,6 +355,7 @@ __sv_f64_t _ZGVsMxv_exp10m1 (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxvv_hypot (__sv_f64_t, __sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_log (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_log10 (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_log10p1 (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_log1p (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_log2 (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_log2p1 (__sv_f64_t, __sv_bool_t);

View File

@ -67,6 +67,8 @@
!GCC$ builtin (log) attributes simd (notinbranch)
!GCC$ builtin (log10) attributes simd (notinbranch)
!GCC$ builtin (log10f) attributes simd (notinbranch)
!GCC$ builtin (log10p1) attributes simd (notinbranch)
!GCC$ builtin (log10p1f) attributes simd (notinbranch)
!GCC$ builtin (log1p) attributes simd (notinbranch)
!GCC$ builtin (log1pf) attributes simd (notinbranch)
!GCC$ builtin (log2) attributes simd (notinbranch)

View File

@ -0,0 +1,142 @@
/* Double-precision vector log10p1 function
Copyright (C) 2025 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include "v_math.h"
/* Lookup table of polynomial coefficients and constants.  Odd-order
   coefficients c5..c19 are stored as adjacent scalar doubles so that a
   pair can be loaded with one vld1q_f64 and consumed via
   vfmaq_laneq_f64; even-order coefficients are pre-broadcast vectors.  */
static const struct data
{
  float64x2_t c1, c2, c3, c4, c6, c8, c10, c12, c14, c16, c18, c20;
  double c5, c7, c9, c11, c13, c15, c17, c19;
  /* inv_log2_10 = 1/log2(10) = log10(2); inv_ln10 = 1/ln(10).  Kept
     adjacent so both load with a single vld1q_f64.  */
  double inv_log2_10, inv_ln10, one, minus_one;
  int64x2_t one_top;
  uint64x2_t one_m_hf_rt2_top, umask, hf_rt2_top, bottom_mask;
  uint64x2_t inf, minf, nan;
} data = {
  /* Coefficients generated using FPMinimax deg=20, in
     [sqrt(2)/2-1, sqrt(2)-1].  */
  .c1 = V2 (0x1.bcb7b1526e50fp-2),
  .c2 = V2 (-0x1.bcb7b1526e4fep-3),
  .c3 = V2 (0x1.287a7636f3cfp-3),
  .c4 = V2 (-0x1.bcb7b152707cap-4),
  .c5 = 0x1.63c62775e6667p-4,
  .c6 = V2 (-0x1.287a76368cf37p-4),
  .c7 = 0x1.fc3fa57ed69cep-5,
  .c8 = V2 (-0x1.bcb7b0eed8335p-5),
  .c9 = 0x1.8b4e10d4ecd69p-5,
  .c10 = V2 (-0x1.63c65554f2db4p-5),
  .c11 = 0x1.436b003e5358p-5,
  .c12 = V2 (-0x1.2872d378b6363p-5),
  .c13 = 0x1.11e15dd8ac0efp-5,
  .c14 = V2 (-0x1.fd8dc08e6b21p-6),
  .c15 = 0x1.d6fabf7e5c622p-6,
  .c16 = V2 (-0x1.ad53855566e62p-6),
  .c17 = 0x1.a9547f6043884p-6,
  .c18 = V2 (-0x1.e4a167fcd3e22p-6),
  .c19 = 0x1.c2f6859a15a65p-6,
  .c20 = V2 (-0x1.91c6df82d809bp-7),
  /* Top bits of sqrt(2)/2, and 1 minus those top bits (biases the
     exponent extraction in the reduction step).  */
  .hf_rt2_top = V2 (0x3fe6a09e00000000),
  .one_m_hf_rt2_top = V2 (0x00095f6200000000),
  .umask = V2 (0x000fffff00000000),
  .one_top = V2 (0x3ff),
  .inv_ln10 = 0x1.bcb7b1526e50ep-2,
  .inv_log2_10 = 0x1.34413509f79ffp-2,
  .inf = V2 (0x7ff0000000000000),
  .minf = V2 (0xfff0000000000000),
  .nan = V2 (0x7fffffffffffffff),
  .bottom_mask = V2 (0xffffffff),
  /* Fields are double: use double literals.  (The original float
     literals -1.0f/1.0f were exact but inconsistent with the field
     type.)  */
  .minus_one = -1.0,
  .one = 1.0,
};
/* Handle the lanes selected by CMP: return +inf for x >= +inf, -inf for
   x == -1, and NaN for x < -1 or x == NaN; all other lanes keep y.  */
static inline float64x2_t
special_case (const struct data *d, float64x2_t x, float64x2_t y,
	      uint64x2_t cmp)
{
  /* True only for x == +inf (NaN compares false with vcgeq).  */
  uint64x2_t ret_inf = vcgeq_f64 (x, vreinterpretq_f64_u64 (d->inf));
  /* x == -1 -> -inf; x < -1 or x == NaN -> NaN (vcgeq false for NaN).  */
  uint64x2_t neg_val
      = vbslq_u64 (vcgeq_f64 (x, v_f64 (d->minus_one)), d->minf, d->nan);
  float64x2_t s = vreinterpretq_f64_u64 (vbslq_u64 (ret_inf, d->inf, neg_val));
  return vbslq_f64 (cmp, s, y);
}
/* Vector log10p1 approximation using polynomial on reduced interval.
   Worst-case error is 2.69 ULP:
   _ZGVnN2v_log10p1(-0x1.2582542cd267p-15) got -0x1.fde2ee0eb629p-17
					   want -0x1.fde2ee0eb628dp-17 . */
VPCS_ATTR float64x2_t V_NAME_D1 (log10p1) (float64x2_t x)
{
  const struct data *d = ptr_barrier (&data);

  /* With m = 1 + x = 2^k * (1 + f), where f is in
     [sqrt(2)/2-1, sqrt(2)-1]:
     log10p1(x) = k * log10(2) + log10(1 + f) + correction for the
     rounding error of m = 1 + x.  */

  /* Calculate scaling factor k.  */
  float64x2_t m = vaddq_f64 (x, v_f64 (d->one));
  uint64x2_t mi = vreinterpretq_u64_f64 (m);
  /* Bias by 1 - top bits of sqrt(2)/2 so that the shifted exponent
     below yields k with the reduced fraction in [sqrt(2)/2, sqrt(2)).  */
  uint64x2_t u = vaddq_u64 (mi, d->one_m_hf_rt2_top);
  int64x2_t ki
      = vsubq_s64 (vreinterpretq_s64_u64 (vshrq_n_u64 (u, 52)), d->one_top);
  float64x2_t k = vcvtq_f64_s64 (ki);

  /* Reduce x to f in [sqrt(2)/2, sqrt (2)].  */
  uint64x2_t utop = vaddq_u64 (vandq_u64 (u, d->umask), d->hf_rt2_top);
  uint64x2_t u_red = vorrq_u64 (utop, vandq_u64 (mi, d->bottom_mask));
  float64x2_t f
      = vaddq_f64 (vreinterpretq_f64_u64 (u_red), v_f64 (d->minus_one));

  /* Correction term c/m, with c = x - (m - 1) recovering the rounding
     error made when computing m = 1 + x.  */
  float64x2_t cm
      = vdivq_f64 (vsubq_f64 (x, vaddq_f64 (m, v_f64 (d->minus_one))), m);

  /* Order-18 Pairwise Horner evaluation scheme.  */
  float64x2_t f2 = vmulq_f64 (f, f);
  /* Each vld1q_f64 loads a pair of adjacent scalar coefficients,
     e.g. c57 = { c5, c7 }, consumed by lane with vfmaq_laneq_f64.  */
  float64x2_t c57 = vld1q_f64 (&d->c5);
  float64x2_t c911 = vld1q_f64 (&d->c9);
  float64x2_t c1315 = vld1q_f64 (&d->c13);
  float64x2_t c1719 = vld1q_f64 (&d->c17);

  /* pNM = cN + f * cM for each adjacent coefficient pair.  */
  float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, f, c1719, 1);
  float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, f, c1719, 0);
  float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, f, c1315, 1);
  float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, f, c1315, 0);
  float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, f, c911, 1);
  float64x2_t p89 = vfmaq_laneq_f64 (d->c8, f, c911, 0);
  float64x2_t p67 = vfmaq_laneq_f64 (d->c6, f, c57, 1);
  float64x2_t p45 = vfmaq_laneq_f64 (d->c4, f, c57, 0);
  float64x2_t p23 = vfmaq_f64 (d->c2, f, d->c3);

  /* Fold the pairs together with powers of f^2.  */
  float64x2_t p = vfmaq_f64 (p1819, f2, d->c20);
  p = vfmaq_f64 (p1617, f2, p);
  p = vfmaq_f64 (p1415, f2, p);
  p = vfmaq_f64 (p1213, f2, p);
  p = vfmaq_f64 (p1011, f2, p);
  p = vfmaq_f64 (p89, f2, p);
  p = vfmaq_f64 (p67, f2, p);
  p = vfmaq_f64 (p45, f2, p);
  p = vfmaq_f64 (p23, f2, p);
  p = vfmaq_f64 (d->c1, f, p);

  /* inv_log_consts = { 1/log2(10), 1/ln(10) }.  */
  float64x2_t inv_log_consts = vld1q_f64 (&d->inv_log2_10);
  /* Assemble log10p1(x) = k/log2(10) + log10p1(f) + c/(m * ln10).  */
  float64x2_t y = vfmaq_laneq_f64 (vmulq_f64 (p, f), k, inv_log_consts, 0);
  y = vfmaq_laneq_f64 (y, cm, inv_log_consts, 1);

  /* Special cases: x == (-inf), x == nan, x <= -1 .  */
  uint64x2_t special
      = vorrq_u64 (vcleq_f64 (x, v_f64 (d->minus_one)),
		   vcgeq_f64 (x, vreinterpretq_f64_u64 (d->inf)));
  if (__glibc_unlikely (v_any_u64 (special)))
    return special_case (d, x, y, special);
  return y;
}

View File

@ -0,0 +1,137 @@
/* Double-precision vector log10p1 function
Copyright (C) 2025 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include "sv_math.h"
/* Lookup table.  Even-order coefficients c2..c20 followed by the log
   constants form one run of adjacent scalars; odd-order coefficients
   c1..c19 form a second run, so that pairs can be broadcast with a
   single svld1rq and consumed via svmla_lane_f64.  */
static const struct data
{
  double c2, c4, c6, c8, c10, c12, c14, c16, c18, c20;
  /* inv_log2_10 = 1/log2(10) = log10(2); inv_ln10 = 1/ln(10).  */
  double inv_log2_10, inv_ln10;
  double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19;
  uint64_t hf_rt2_top, one_m_hf_rt2_top, umask, bottom_mask;
  int64_t one_top;
} data = {
  /* Coefficients generated using FPMinimax deg=20, in
     [sqrt(2)/2-1, sqrt(2)-1].  */
  .c1 = 0x1.bcb7b1526e50fp-2,
  .c2 = -0x1.bcb7b1526e4fep-3,
  .c3 = 0x1.287a7636f3cfp-3,
  .c4 = -0x1.bcb7b152707cap-4,
  .c5 = 0x1.63c62775e6667p-4,
  .c6 = -0x1.287a76368cf37p-4,
  .c7 = 0x1.fc3fa57ed69cep-5,
  .c8 = -0x1.bcb7b0eed8335p-5,
  .c9 = 0x1.8b4e10d4ecd69p-5,
  .c10 = -0x1.63c65554f2db4p-5,
  .c11 = 0x1.436b003e5358p-5,
  .c12 = -0x1.2872d378b6363p-5,
  .c13 = 0x1.11e15dd8ac0efp-5,
  .c14 = -0x1.fd8dc08e6b21p-6,
  .c15 = 0x1.d6fabf7e5c622p-6,
  .c16 = -0x1.ad53855566e62p-6,
  .c17 = 0x1.a9547f6043884p-6,
  .c18 = -0x1.e4a167fcd3e22p-6,
  .c19 = 0x1.c2f6859a15a65p-6,
  .c20 = -0x1.91c6df82d809bp-7,
  /* Top bits of sqrt(2)/2, and 1 minus those top bits (biases the
     exponent extraction in the reduction step).  */
  .hf_rt2_top = 0x3fe6a09e00000000,
  .one_m_hf_rt2_top = 0x00095f6200000000,
  .one_top = 0x3ff,
  .inv_ln10 = 0x1.bcb7b1526e50ep-2,
  .inv_log2_10 = 0x1.34413509f79ffp-2,
  .umask = 0x000fffff00000000,
  .bottom_mask = 0x00000000ffffffff,
};
/* Handle the lanes selected by SPECIAL: NaN for x < -1 or x == NaN,
   -inf for x == -1, +inf for x == +inf; other lanes keep y.  */
static svfloat64_t NOINLINE
special_case (svfloat64_t x, svfloat64_t y, svbool_t special, svbool_t pg)
{
  /* Default every special lane to NaN, then patch the infinities.  */
  y = svsel (special, sv_f64 (NAN), y);
  svbool_t ret_pinf = svcmpeq_f64 (pg, x, sv_f64 (INFINITY));
  svbool_t ret_minf = svcmpeq_f64 (pg, x, sv_f64 (-1.0));
  y = svsel (ret_pinf, sv_f64 (INFINITY), y);
  return svsel (ret_minf, sv_f64 (-INFINITY), y);
}
/* Vector log10p1 approximation using polynomial on reduced interval.
   Worst-case error is 2.81 ULP:
   _ZGVsMxv_log10p1(0x1.25c3f17d7602p-53) got 0x1.fe52a1624aad1p-55
					  want 0x1.fe52a1624aacep-55.  */
svfloat64_t SV_NAME_D1 (log10p1) (svfloat64_t x, const svbool_t pg)
{
  const struct data *d = ptr_barrier (&data);

  /* With m = 1 + x = 2^k * (1 + f), where f is in
     [sqrt(2)/2-1, sqrt(2)-1]:
     log10p1(x) = k * log10(2) + log10(1 + f) + correction for the
     rounding error of m = 1 + x.  */

  /* Calculate scaling factor k.  */
  svfloat64_t m = svsub_x (pg, x, -1.0);
  svuint64_t mi = svreinterpret_u64 (m);
  /* Bias by 1 - top bits of sqrt(2)/2 so that the shifted exponent
     below yields k with the reduced fraction in [sqrt(2)/2, sqrt(2)).  */
  svuint64_t u = svadd_x (pg, mi, d->one_m_hf_rt2_top);
  svint64_t ki
      = svsub_x (pg, svreinterpret_s64 (svlsr_x (pg, u, 52)), d->one_top);
  svfloat64_t k = svcvt_f64_x (pg, ki);

  /* Reduce x to f in [sqrt(2)/2, sqrt(2)].  */
  svuint64_t utop = svadd_x (pg, svand_x (pg, u, d->umask), d->hf_rt2_top);
  svuint64_t u_red = svorr_x (pg, utop, svand_x (pg, mi, d->bottom_mask));
  svfloat64_t f = svsub_x (svptrue_b64 (), svreinterpret_f64 (u_red), 1);

  /* Correction term c/m, with c = x - (m - 1) recovering the rounding
     error made when computing m = 1 + x.  */
  svfloat64_t c = svsub_x (pg, x, svsub_x (svptrue_b64 (), m, 1));
  svfloat64_t cm = svdiv_x (pg, c, m);

  /* Order-18 Pairwise Horner evaluation scheme.  */
  svfloat64_t f2 = svmul_x (svptrue_b64 (), f, f);
  /* Each svld1rq broadcasts a pair of adjacent scalar coefficients,
     e.g. c13 = { c1, c3 }, consumed by lane with svmla_lane_f64.  */
  svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1);
  svfloat64_t c57 = svld1rq (svptrue_b64 (), &d->c5);
  svfloat64_t c911 = svld1rq (svptrue_b64 (), &d->c9);
  svfloat64_t c1315 = svld1rq (svptrue_b64 (), &d->c13);
  svfloat64_t c1719 = svld1rq (svptrue_b64 (), &d->c17);
  /* Broadcasts { c20, inv_log2_10 }; only lane 0 (c20) is used below.
     NOTE(review): name suggests 1/ln2, but lane 1 is 1/log2(10) —
     consider renaming.  */
  svfloat64_t c20_inv_ln2 = svld1rq (svptrue_b64 (), &d->c20);

  /* pNM = cN + f * cM for each adjacent coefficient pair.  */
  svfloat64_t p1819 = svmla_lane_f64 (sv_f64 (d->c18), f, c1719, 1);
  svfloat64_t p1617 = svmla_lane_f64 (sv_f64 (d->c16), f, c1719, 0);
  svfloat64_t p1415 = svmla_lane_f64 (sv_f64 (d->c14), f, c1315, 1);
  svfloat64_t p1213 = svmla_lane_f64 (sv_f64 (d->c12), f, c1315, 0);
  svfloat64_t p1011 = svmla_lane_f64 (sv_f64 (d->c10), f, c911, 1);
  svfloat64_t p89 = svmla_lane_f64 (sv_f64 (d->c8), f, c911, 0);
  svfloat64_t p67 = svmla_lane_f64 (sv_f64 (d->c6), f, c57, 1);
  svfloat64_t p45 = svmla_lane_f64 (sv_f64 (d->c4), f, c57, 0);
  svfloat64_t p23 = svmla_lane_f64 (sv_f64 (d->c2), f, c13, 1);

  /* Fold the pairs together with powers of f^2.  */
  svfloat64_t p = svmla_lane_f64 (p1819, f2, c20_inv_ln2, 0);
  p = svmla_x (pg, p1617, f2, p);
  p = svmla_x (pg, p1415, f2, p);
  p = svmla_x (pg, p1213, f2, p);
  p = svmla_x (pg, p1011, f2, p);
  p = svmla_x (pg, p89, f2, p);
  p = svmla_x (pg, p67, f2, p);
  p = svmla_x (pg, p45, f2, p);
  p = svmla_x (pg, p23, f2, p);
  p = svmla_x (pg, sv_f64 (d->c1), f, p);

  /* inv_log_consts = { 1/log2(10), 1/ln(10) }.
     Assemble log10p1(x) = k*log10(2) + log10p1(f) + c/(m * ln10).  */
  svfloat64_t inv_log_consts = svld1rq_f64 (svptrue_b64 (), &d->inv_log2_10);
  svfloat64_t y = svmla_lane_f64 (svmul_x (pg, p, f), k, inv_log_consts, 0);
  y = svmla_lane_f64 (y, cm, inv_log_consts, 1);

  /* Special cases: x == (-inf), x == nan, x <= -1 .
     svorn selects (x <= -1) | ~(x < inf), catching inf and NaN too.  */
  svbool_t special = svorn_z (pg, svcmple (svptrue_b64 (), x, sv_f64 (-1.0)),
			      svcmplt (pg, x, sv_f64 (INFINITY)));
  if (__glibc_unlikely (svptest_any (pg, special)))
    return special_case (x, y, special, pg);
  return y;
}

View File

@ -0,0 +1,123 @@
/* Single-precision vector log10p1 function
Copyright (C) 2025 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include "v_math.h"
static const struct data
{
uint32x4_t four;
int32x4_t three_quarters;
float32x4_t c2, c4, c6, c8, c10, c12, small_log10_2;
float c1, c3, c5, c7, c9, c11, one_quarter;
float32x4_t pinf, minf, nan;
} data = {
/* Polynomial generated using FPMinimax in [-0.25, 0.5]. */
.c1 = 0x1.bcb7b2p-2,
.c2 = V4 (-0x1.bcb79ep-3),
.c3 = 0x1.287984p-3,
.c4 = V4 (-0x1.bcc0d4p-4),
.c5 = 0x1.642986p-4,
.c6 = V4 (-0x1.2815bcp-4),
.c7 = 0x1.ec8a4p-5,
.c8 = V4 (-0x1.ac4418p-5),
.c9 = 0x1.f155a8p-5,
.c10 = V4 (-0x1.34b422p-4),
.c11 = 0x1.bc5418p-5,
.c12 = V4 (-0x1.911e6ep-7),
.four = V4 (0x40800000),
.three_quarters = V4 (0x3f400000),
.one_quarter = 0.25f,
.pinf = V4 (INFINITY),
.minf = V4 (-INFINITY),
.nan = V4 (NAN),
/* small_log_2 = log10(2) * 2^-23. */
.small_log10_2 = V4 (0x1.344136p-25f),
};
/* Handle the lanes selected by CMP: NaN for x < -1 (including -inf) or
   x == NaN, -inf for x == -1, +inf for x == +inf; other lanes keep y.  */
static inline float32x4_t
special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp,
	      const struct data *d)
{
  /* Default every special lane to NaN, then patch the infinities.  */
  y = vbslq_f32 (cmp, d->nan, y);
  uint32x4_t ret_pinf = vceqq_f32 (x, d->pinf);
  uint32x4_t ret_minf = vceqq_f32 (x, v_f32 (-1.0));
  y = vbslq_f32 (ret_pinf, d->pinf, y);
  return vbslq_f32 (ret_minf, d->minf, y);
}
/* Vector log10p1f approximation using polynomial on reduced interval.
   Worst-case error is 3.39 ULP:
   _ZGVnN4v_log10p1f(0x1.8789fcp-2) got 0x1.000002p+1
				    want 0x1.fffffep+0.  */
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10p1) (float32x4_t x)
{
  const struct data *d = ptr_barrier (&data);

  /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m
     is in [-0.25, 0.5]):
     log10p1(x) = log10(t) + log10(2^k) = log10p1(m) + k * log10(2).
     We approximate log10p1(m) with a polynomial, then scale by
     k. Instead of doing this directly, we use an intermediate
     scale factor s = 4*k to ensure the scale is representable
     as a normalised fp32 number.  */
  float32x4_t m = vsubq_f32 (x, v_f32 (-1.0f));

  /* Choose k to scale x to the range [-1/4, 1/2]: minf reinterpreted
     as s32 is 0xff800000, i.e. the sign+exponent mask, so k holds the
     exponent bits of m - 0.75.  */
  int32x4_t k
      = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d->three_quarters),
		   vreinterpretq_s32_f32 (d->minf));
  uint32x4_t ku = vreinterpretq_u32_s32 (k);

  /* Scale up to ensure that the scale factor is representable as normalised
     fp32 number, and scale m down accordingly.  */
  float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d->four, ku));

  /* Scale x by exponent manipulation.  */
  float32x4_t m_scale
      = vreinterpretq_f32_u32 (vsubq_u32 (vreinterpretq_u32_f32 (x), ku));
  /* consts = { c9, c11, one_quarter, <unused> }.  */
  float32x4_t consts = vld1q_f32 (&d->c9);
  /* m_scale += s/4 - 1, restoring the units lost by scaling x.  */
  m_scale = vaddq_f32 (m_scale, vfmaq_laneq_f32 (v_f32 (-1.0f), s, consts, 2));
  /* k * log10(2), computed as float(k_bits) * (log10(2) * 2^-23).  */
  float32x4_t scale_back = vmulq_f32 (vcvtq_f32_s32 (k), d->small_log10_2);
  float32x4_t m2 = vmulq_f32 (m_scale, m_scale);

  /* Order-12 pairwise Horner.  c1357 = { c1, c3, c5, c7 }.  */
  float32x4_t c1357 = vld1q_f32 (&d->c1);
  float32x4_t p23 = vfmaq_laneq_f32 (d->c2, m_scale, c1357, 1);
  float32x4_t p45 = vfmaq_laneq_f32 (d->c4, m_scale, c1357, 2);
  float32x4_t p67 = vfmaq_laneq_f32 (d->c6, m_scale, c1357, 3);
  float32x4_t p89 = vfmaq_laneq_f32 (d->c8, m_scale, consts, 0);
  float32x4_t p1011 = vfmaq_laneq_f32 (d->c10, m_scale, consts, 1);
  float32x4_t p = vfmaq_f32 (p1011, m2, d->c12);
  p = vfmaq_f32 (p89, m2, p);
  p = vfmaq_f32 (p67, m2, p);
  p = vfmaq_f32 (p45, m2, p);
  p = vfmaq_f32 (p23, m2, p);
  /* c1 term folded into the k*log10(2) accumulator.  */
  float32x4_t scaled_c1 = vfmaq_laneq_f32 (scale_back, m_scale, c1357, 0);

  /* Special cases: x <= -1, |x| == inf, x == nan.
     ~(|x| < inf) catches +/-inf and NaN.  */
  uint32x4_t special_cases = vorrq_u32 (vmvnq_u32 (vcaltq_f32 (x, d->pinf)),
					vcleq_f32 (x, v_f32 (-1.0)));
  if (__glibc_unlikely (v_any_u32 (special_cases)))
    return special_case (x, vfmaq_f32 (scaled_c1, m2, p), special_cases, d);
  return vfmaq_f32 (scaled_c1, m2, p);
}
libmvec_hidden_def (V_NAME_F1 (log10p1))
HALF_WIDTH_ALIAS_F1 (log10p1)

View File

@ -0,0 +1,125 @@
/* Single-precision vector log10p1 function
Copyright (C) 2025 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include "sv_math.h"
static const struct data
{
uint32_t four;
int32_t three_quarters;
float32_t c2, c4, c6, c8, c10, c12;
float32_t c1, c3, c5, c7, c9, c11, small_log10_2;
} data = {
/* Polynomial generated using FPMinimax in [-0.25, 0.5]. */
.c1 = 0x1.bcb7b2p-2,
.c2 = -0x1.bcb79ep-3,
.c3 = 0x1.287984p-3,
.c4 = -0x1.bcc0d4p-4,
.c5 = 0x1.642986p-4,
.c6 = -0x1.2815bcp-4,
.c7 = 0x1.ec8a4p-5,
.c8 = -0x1.ac4418p-5,
.c9 = 0x1.f155a8p-5,
.c10 = -0x1.34b422p-4,
.c11 = 0x1.bc5418p-5,
.c12 = -0x1.911e6ep-7,
.four = 0x40800000,
.three_quarters = 0x3f400000,
/* 2^-23 * log10(2) used for scaling. */
.small_log10_2 = 0x1.344136p-25f,
};
#define SignedExpMask sv_s32 (0xff800000)
/* Handle the lanes selected by SPECIAL: NaN for x < -1 or x == NaN,
   -inf for x == -1, +inf for x == +inf; other lanes keep y.  */
static svfloat32_t NOINLINE
special_case (svfloat32_t x, svfloat32_t y, const svbool_t pg,
	      svbool_t special)
{
  /* Default every special lane to NaN, then patch the infinities.  */
  y = svsel (special, sv_f32 (NAN), y);
  svbool_t ret_pinf = svcmpeq_f32 (pg, x, sv_f32 (INFINITY));
  svbool_t ret_minf = svcmpeq_f32 (pg, x, sv_f32 (-1.0));
  y = svsel (ret_pinf, sv_f32 (INFINITY), y);
  return svsel (ret_minf, sv_f32 (-INFINITY), y);
}
/* Vector log10p1f approximation using polynomial on reduced interval.
   Worst-case error is 3.40 ULP:
   _ZGVsMxv_log10p1f(0x1.8bfff6p+6) got 0x1.000002p+1
				    want 0x1.fffffep+0.  */
svfloat32_t SV_NAME_F1 (log10p1) (svfloat32_t x, const svbool_t pg)
{
  const struct data *d = ptr_barrier (&data);

  /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m
     is in [-0.25, 0.5]):
     log10p1(x) = log10(t) + log10(2^k) = log10p1(m) + k * log10(2).
     We approximate log10p1(m) with a polynomial, then scale by
     k. Instead of doing this directly, we use an intermediate
     scale factor s = 4*k to ensure the scale is representable
     as a normalised fp32 number.  */
  svfloat32_t m = svsub_x (svptrue_b32 (), x, -1.0f);

  /* Choose k to scale x to the range [-1/4, 1/2]: keep only the sign
     and exponent bits of m - 0.75.  */
  svint32_t k = svand_x (
      pg, svsub_x (svptrue_b32 (), svreinterpret_s32 (m), d->three_quarters),
      SignedExpMask);

  /* Scale up to ensure that the scale factor is representable as normalised
     fp32 number, and scale m down accordingly.  */
  svfloat32_t s = svreinterpret_f32 (svsubr_x (pg, k, d->four));

  /* Scale x by exponent manipulation.  */
  svfloat32_t m_scale = svreinterpret_f32_u32 (
      svsub_x (svptrue_b32 (), svreinterpret_u32 (x), svreinterpret_u32 (k)));
  /* m_scale += s/4 - 1, restoring the units lost by scaling x.  */
  m_scale = svadd_x (svptrue_b32 (), m_scale,
		     svmla_x (pg, sv_f32 (-1.0f), sv_f32 (0.25f), s));
  /* k * log10(2), computed as float(k_bits) * (log10(2) * 2^-23).  */
  svfloat32_t scale_back
      = svmul_x (svptrue_b32 (), svcvt_f32_x (svptrue_b32 (), k),
		 sv_f32 (d->small_log10_2));
  svfloat32_t m2 = svmul_x (svptrue_b32 (), m_scale, m_scale);

  /* Order-12 pairwise Horner.  Each svld1rq broadcasts four adjacent
     scalar coefficients, e.g. c1357 = { c1, c3, c5, c7 }.  */
  svfloat32_t c1357 = svld1rq (svptrue_b32 (), &d->c1);
  svfloat32_t c911 = svld1rq (svptrue_b32 (), &d->c9);
  svfloat32_t p23 = svmla_lane (sv_f32 (d->c2), m_scale, c1357, 1);
  svfloat32_t p45 = svmla_lane (sv_f32 (d->c4), m_scale, c1357, 2);
  svfloat32_t p67 = svmla_lane (sv_f32 (d->c6), m_scale, c1357, 3);
  svfloat32_t p89 = svmla_lane (sv_f32 (d->c8), m_scale, c911, 0);
  svfloat32_t p1011 = svmla_lane (sv_f32 (d->c10), m_scale, c911, 1);
  svfloat32_t p = svmla_x (pg, p1011, m2, d->c12);
  p = svmla_x (pg, p89, m2, p);
  p = svmla_x (pg, p67, m2, p);
  p = svmla_x (pg, p45, m2, p);
  p = svmla_x (pg, p23, m2, p);

  /* Scaling factor m_scale is multiplied with c1 coeff, then added to p.  */
  svfloat32_t scaled_c1 = svmla_lane (scale_back, m_scale, c1357, 0);

  /* Special cases: x <= -1, x == inf, x == nan.
     svorn selects (x <= -1) | ~(x < inf), catching inf and NaN too.  */
  svbool_t special_cases
      = svorn_z (pg, svcmple (svptrue_b32 (), x, sv_f32 (-1.0)),
		 svcmplt (pg, x, sv_f32 (INFINITY)));
  if (__glibc_unlikely (svptest_any (pg, special_cases)))
    return special_case (x, svmla_x (pg, scaled_c1, m2, p), pg, special_cases);
  return svmla_x (pg, scaled_c1, m2, p);
}

View File

@ -49,6 +49,7 @@ VPCS_VECTOR_WRAPPER (exp10m1_advsimd, _ZGVnN2v_exp10m1)
VPCS_VECTOR_WRAPPER_ff (hypot_advsimd, _ZGVnN2vv_hypot)
VPCS_VECTOR_WRAPPER (log_advsimd, _ZGVnN2v_log)
VPCS_VECTOR_WRAPPER (log10_advsimd, _ZGVnN2v_log10)
VPCS_VECTOR_WRAPPER (log10p1_advsimd, _ZGVnN2v_log10p1)
VPCS_VECTOR_WRAPPER (log1p_advsimd, _ZGVnN2v_log1p)
VPCS_VECTOR_WRAPPER (log2_advsimd, _ZGVnN2v_log2)
VPCS_VECTOR_WRAPPER (log2p1_advsimd, _ZGVnN2v_log2p1)

View File

@ -68,6 +68,7 @@ SVE_VECTOR_WRAPPER (exp10m1_sve, _ZGVsMxv_exp10m1)
SVE_VECTOR_WRAPPER_ff (hypot_sve, _ZGVsMxvv_hypot)
SVE_VECTOR_WRAPPER (log_sve, _ZGVsMxv_log)
SVE_VECTOR_WRAPPER (log10_sve, _ZGVsMxv_log10)
SVE_VECTOR_WRAPPER (log10p1_sve, _ZGVsMxv_log10p1)
SVE_VECTOR_WRAPPER (log1p_sve, _ZGVsMxv_log1p)
SVE_VECTOR_WRAPPER (log2_sve, _ZGVsMxv_log2)
SVE_VECTOR_WRAPPER (log2p1_sve, _ZGVsMxv_log2p1)

View File

@ -49,6 +49,7 @@ VPCS_VECTOR_WRAPPER (exp10m1f_advsimd, _ZGVnN4v_exp10m1f)
VPCS_VECTOR_WRAPPER_ff (hypotf_advsimd, _ZGVnN4vv_hypotf)
VPCS_VECTOR_WRAPPER (logf_advsimd, _ZGVnN4v_logf)
VPCS_VECTOR_WRAPPER (log10f_advsimd, _ZGVnN4v_log10f)
VPCS_VECTOR_WRAPPER (log10p1f_advsimd, _ZGVnN4v_log10p1f)
VPCS_VECTOR_WRAPPER (log1pf_advsimd, _ZGVnN4v_log1pf)
VPCS_VECTOR_WRAPPER (log2f_advsimd, _ZGVnN4v_log2f)
VPCS_VECTOR_WRAPPER (log2p1f_advsimd, _ZGVnN4v_log2p1f)

View File

@ -68,6 +68,7 @@ SVE_VECTOR_WRAPPER (exp10m1f_sve, _ZGVsMxv_exp10m1f)
SVE_VECTOR_WRAPPER_ff (hypotf_sve, _ZGVsMxvv_hypotf)
SVE_VECTOR_WRAPPER (logf_sve, _ZGVsMxv_logf)
SVE_VECTOR_WRAPPER (log10f_sve, _ZGVsMxv_log10f)
SVE_VECTOR_WRAPPER (log10p1f_sve, _ZGVsMxv_log10p1f)
SVE_VECTOR_WRAPPER (log1pf_sve, _ZGVsMxv_log1pf)
SVE_VECTOR_WRAPPER (log2f_sve, _ZGVsMxv_log2f)
SVE_VECTOR_WRAPPER (log2p1f_sve, _ZGVsMxv_log2p1f)

View File

@ -172,14 +172,19 @@ GLIBC_2.43 _ZGVnN2v_exp10m1 F
GLIBC_2.43 _ZGVnN2v_exp10m1f F
GLIBC_2.43 _ZGVnN2v_exp2m1 F
GLIBC_2.43 _ZGVnN2v_exp2m1f F
GLIBC_2.43 _ZGVnN2v_log10p1 F
GLIBC_2.43 _ZGVnN2v_log10p1f F
GLIBC_2.43 _ZGVnN2v_log2p1 F
GLIBC_2.43 _ZGVnN2v_log2p1f F
GLIBC_2.43 _ZGVnN4v_exp10m1f F
GLIBC_2.43 _ZGVnN4v_exp2m1f F
GLIBC_2.43 _ZGVnN4v_log10p1f F
GLIBC_2.43 _ZGVnN4v_log2p1f F
GLIBC_2.43 _ZGVsMxv_exp10m1 F
GLIBC_2.43 _ZGVsMxv_exp10m1f F
GLIBC_2.43 _ZGVsMxv_exp2m1 F
GLIBC_2.43 _ZGVsMxv_exp2m1f F
GLIBC_2.43 _ZGVsMxv_log10p1 F
GLIBC_2.43 _ZGVsMxv_log10p1f F
GLIBC_2.43 _ZGVsMxv_log2p1 F
GLIBC_2.43 _ZGVsMxv_log2p1f F