math: Improve fmod(f) performance

Optimize the fast paths (x < y) and (x/y < 2^12).  Delay handling of special
cases to reduce the number of instructions executed before the fast paths.
Performance improvements for fmod:

		Skylake	Zen2	Neoverse V1
subnormals	11.8%	4.2%	11.5%
normal		3.9%	0.01%	-0.5%
close-exponents	6.3%	5.6%	19.4%

Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org>
This commit is contained in:
Wilco Dijkstra 2023-04-17 12:42:18 +01:00
parent 2623479105
commit 76d0f094dd
2 changed files with 101 additions and 77 deletions

View File

@ -40,10 +40,10 @@
r == x % y == (x % (N * y)) % y r == x % y == (x % (N * y)) % y
And with mx/my being mantissa of double floating point number (which uses And with mx/my being mantissa of a double floating point number (which uses
less bits than the storage type), on each step the argument reduction can less bits than the storage type), on each step the argument reduction can
be improved by 11 (which is the size of uint64_t minus MANTISSA_WIDTH plus be improved by 11 (which is the size of uint64_t minus MANTISSA_WIDTH plus
the signal bit): the implicit one bit):
mx * 2^ex == 2^11 * mx * 2^(ex - 11) mx * 2^ex == 2^11 * mx * 2^(ex - 11)
@ -54,7 +54,12 @@
mx << 11; mx << 11;
ex -= 11; ex -= 11;
mx %= my; mx %= my;
} */ }
Special cases:
- If x or y is a NaN, a NaN is returned.
- If x is an infinity, or y is zero, a NaN is returned and EDOM is set.
- If x is +0/-0, and y is not zero, +0/-0 is returned. */
double double
__fmod (double x, double y) __fmod (double x, double y)
@ -67,62 +72,70 @@ __fmod (double x, double y)
hx ^= sx; hx ^= sx;
hy &= ~SIGN_MASK; hy &= ~SIGN_MASK;
/* Special cases: /* If x < y, return x (unless y is a NaN). */
- If x or y is a Nan, NaN is returned. if (__glibc_likely (hx < hy))
- If x is an inifinity, a NaN is returned and EDOM is set.
- If y is zero, Nan is returned.
- If x is +0/-0, and y is not zero, +0/-0 is returned. */
if (__glibc_unlikely (hy == 0
|| hx >= EXPONENT_MASK || hy > EXPONENT_MASK))
{ {
if (is_nan (hx) || is_nan (hy)) /* If y is a NaN, return a NaN. */
return (x * y) / (x * y); if (__glibc_unlikely (hy > EXPONENT_MASK))
return __math_edom ((x * y) / (x * y)); return x * y;
}
if (__glibc_unlikely (hx <= hy))
{
if (hx < hy)
return x; return x;
return asdouble (sx);
} }
int ex = hx >> MANTISSA_WIDTH; int ex = hx >> MANTISSA_WIDTH;
int ey = hy >> MANTISSA_WIDTH; int ey = hy >> MANTISSA_WIDTH;
int exp_diff = ex - ey;
/* Common case where exponents are close: ey >= -907 and |x/y| < 2^52, */ /* Common case where exponents are close: |x/y| < 2^12, x not inf/NaN
if (__glibc_likely (ey > MANTISSA_WIDTH && ex - ey <= EXPONENT_WIDTH)) and |x%y| not denormal. */
if (__glibc_likely (ey < (EXPONENT_MASK >> MANTISSA_WIDTH) - EXPONENT_WIDTH
&& ey > MANTISSA_WIDTH
&& exp_diff <= EXPONENT_WIDTH))
{ {
uint64_t mx = (hx & MANTISSA_MASK) | (MANTISSA_MASK + 1); uint64_t mx = (hx << EXPONENT_WIDTH) | SIGN_MASK;
uint64_t my = (hy & MANTISSA_MASK) | (MANTISSA_MASK + 1); uint64_t my = (hy << EXPONENT_WIDTH) | SIGN_MASK;
uint64_t d = (ex == ey) ? (mx - my) : (mx << (ex - ey)) % my; mx %= (my >> exp_diff);
return make_double (d, ey - 1, sx);
if (__glibc_unlikely (mx == 0))
return asdouble (sx);
int shift = clz_uint64 (mx);
ex -= shift + 1;
mx <<= shift;
mx = sx | (mx >> EXPONENT_WIDTH);
return asdouble (mx + ((uint64_t)ex << MANTISSA_WIDTH));
} }
/* Special case, both x and y are subnormal. */ if (__glibc_unlikely (hy == 0 || hx >= EXPONENT_MASK))
if (__glibc_unlikely (ex == 0 && ey == 0)) {
/* If x is a NaN, return a NaN. */
if (hx > EXPONENT_MASK)
return x * y;
/* If x is an infinity or y is zero, return a NaN and set EDOM. */
return __math_edom ((x * y) / (x * y));
}
/* Special case, both x and y are denormal. */
if (__glibc_unlikely (ex == 0))
return asdouble (sx | hx % hy); return asdouble (sx | hx % hy);
/* Convert |x| and |y| to 'mx + 2^ex' and 'my + 2^ey'. Assume that hx is /* Extract normalized mantissas - hx is not denormal and hy != 0. */
not subnormal by conditions above. */
uint64_t mx = get_mantissa (hx) | (MANTISSA_MASK + 1); uint64_t mx = get_mantissa (hx) | (MANTISSA_MASK + 1);
ex--;
uint64_t my = get_mantissa (hy) | (MANTISSA_MASK + 1); uint64_t my = get_mantissa (hy) | (MANTISSA_MASK + 1);
int lead_zeros_my = EXPONENT_WIDTH; int lead_zeros_my = EXPONENT_WIDTH;
if (__glibc_likely (ey > 0))
ey--; ey--;
else /* Special case for denormal y. */
if (__glibc_unlikely (ey < 0))
{ {
my = hy; my = hy;
ey = 0;
exp_diff--;
lead_zeros_my = clz_uint64 (my); lead_zeros_my = clz_uint64 (my);
} }
/* Assume hy != 0 */
int tail_zeros_my = ctz_uint64 (my); int tail_zeros_my = ctz_uint64 (my);
int sides_zeroes = lead_zeros_my + tail_zeros_my; int sides_zeroes = lead_zeros_my + tail_zeros_my;
int exp_diff = ex - ey;
int right_shift = exp_diff < tail_zeros_my ? exp_diff : tail_zeros_my; int right_shift = exp_diff < tail_zeros_my ? exp_diff : tail_zeros_my;
my >>= right_shift; my >>= right_shift;
@ -141,8 +154,7 @@ __fmod (double x, double y)
if (exp_diff == 0) if (exp_diff == 0)
return make_double (mx, ey, sx); return make_double (mx, ey, sx);
/* Assume modulo/divide operation is slow, so use multiplication with invert /* Multiplication with the inverse is faster than repeated modulo. */
values. */
uint64_t inv_hy = UINT64_MAX / my; uint64_t inv_hy = UINT64_MAX / my;
while (exp_diff > sides_zeroes) { while (exp_diff > sides_zeroes) {
exp_diff -= sides_zeroes; exp_diff -= sides_zeroes;

View File

@ -40,10 +40,10 @@
r == x % y == (x % (N * y)) % y r == x % y == (x % (N * y)) % y
And with mx/my being mantissa of double floating point number (which uses And with mx/my being mantissa of a single floating point number (which uses
less bits than the storage type), on each step the argument reduction can less bits than the storage type), on each step the argument reduction can
be improved by 8 (which is the size of uint32_t minus MANTISSA_WIDTH plus be improved by 8 (which is the size of uint32_t minus MANTISSA_WIDTH plus
the signal bit): the implicit one bit):
mx * 2^ex == 2^8 * mx * 2^(ex - 8) mx * 2^ex == 2^8 * mx * 2^(ex - 8)
@ -54,7 +54,12 @@
mx << 8; mx << 8;
ex -= 8; ex -= 8;
mx %= my; mx %= my;
} */ }
Special cases:
- If x or y is a NaN, a NaN is returned.
- If x is an infinity, or y is zero, a NaN is returned and EDOM is set.
- If x is +0/-0, and y is not zero, +0/-0 is returned. */
float float
__fmodf (float x, float y) __fmodf (float x, float y)
@ -67,61 +72,69 @@ __fmodf (float x, float y)
hx ^= sx; hx ^= sx;
hy &= ~SIGN_MASK; hy &= ~SIGN_MASK;
/* Special cases: if (__glibc_likely (hx < hy))
- If x or y is a Nan, NaN is returned.
- If x is an inifinity, a NaN is returned.
- If y is zero, Nan is returned.
- If x is +0/-0, and y is not zero, +0/-0 is returned. */
if (__glibc_unlikely (hy == 0
|| hx >= EXPONENT_MASK || hy > EXPONENT_MASK))
{ {
if (is_nan (hx) || is_nan (hy)) /* If y is a NaN, return a NaN. */
return (x * y) / (x * y); if (__glibc_unlikely (hy > EXPONENT_MASK))
return __math_edomf ((x * y) / (x * y)); return x * y;
}
if (__glibc_unlikely (hx <= hy))
{
if (hx < hy)
return x; return x;
return asfloat (sx);
} }
int ex = hx >> MANTISSA_WIDTH; int ex = hx >> MANTISSA_WIDTH;
int ey = hy >> MANTISSA_WIDTH; int ey = hy >> MANTISSA_WIDTH;
int exp_diff = ex - ey;
/* Common case where exponents are close: ey >= -103 and |x/y| < 2^8, */ /* Common case where exponents are close: |x/y| < 2^9, x not inf/NaN
if (__glibc_likely (ey > MANTISSA_WIDTH && ex - ey <= EXPONENT_WIDTH)) and |x%y| not denormal. */
if (__glibc_likely (ey < (EXPONENT_MASK >> MANTISSA_WIDTH) - EXPONENT_WIDTH
&& ey > MANTISSA_WIDTH
&& exp_diff <= EXPONENT_WIDTH))
{ {
uint64_t mx = (hx & MANTISSA_MASK) | (MANTISSA_MASK + 1); uint32_t mx = (hx << EXPONENT_WIDTH) | SIGN_MASK;
uint64_t my = (hy & MANTISSA_MASK) | (MANTISSA_MASK + 1); uint32_t my = (hy << EXPONENT_WIDTH) | SIGN_MASK;
uint32_t d = (ex == ey) ? (mx - my) : (mx << (ex - ey)) % my; mx %= (my >> exp_diff);
return make_float (d, ey - 1, sx);
if (__glibc_unlikely (mx == 0))
return asfloat (sx);
int shift = __builtin_clz (mx);
ex -= shift + 1;
mx <<= shift;
mx = sx | (mx >> EXPONENT_WIDTH);
return asfloat (mx + ((uint32_t)ex << MANTISSA_WIDTH));
} }
/* Special case, both x and y are subnormal. */ if (__glibc_unlikely (hy == 0 || hx >= EXPONENT_MASK))
if (__glibc_unlikely (ex == 0 && ey == 0)) {
/* If x is a NaN, return a NaN. */
if (hx > EXPONENT_MASK)
return x * y;
/* If x is an infinity or y is zero, return a NaN and set EDOM. */
return __math_edomf ((x * y) / (x * y));
}
/* Special case, both x and y are denormal. */
if (__glibc_unlikely (ex == 0))
return asfloat (sx | hx % hy); return asfloat (sx | hx % hy);
/* Convert |x| and |y| to 'mx + 2^ex' and 'my + 2^ey'. Assume that hx is /* Extract normalized mantissas - hx is not denormal and hy != 0. */
not subnormal by conditions above. */
uint32_t mx = get_mantissa (hx) | (MANTISSA_MASK + 1); uint32_t mx = get_mantissa (hx) | (MANTISSA_MASK + 1);
ex--;
uint32_t my = get_mantissa (hy) | (MANTISSA_MASK + 1); uint32_t my = get_mantissa (hy) | (MANTISSA_MASK + 1);
int lead_zeros_my = EXPONENT_WIDTH; int lead_zeros_my = EXPONENT_WIDTH;
if (__glibc_likely (ey > 0))
ey--; ey--;
else /* Special case for denormal y. */
if (__glibc_unlikely (ey < 0))
{ {
my = hy; my = hy;
ey = 0;
exp_diff--;
lead_zeros_my = __builtin_clz (my); lead_zeros_my = __builtin_clz (my);
} }
int tail_zeros_my = __builtin_ctz (my); int tail_zeros_my = __builtin_ctz (my);
int sides_zeroes = lead_zeros_my + tail_zeros_my; int sides_zeroes = lead_zeros_my + tail_zeros_my;
int exp_diff = ex - ey;
int right_shift = exp_diff < tail_zeros_my ? exp_diff : tail_zeros_my; int right_shift = exp_diff < tail_zeros_my ? exp_diff : tail_zeros_my;
my >>= right_shift; my >>= right_shift;
@ -140,8 +153,7 @@ __fmodf (float x, float y)
if (exp_diff == 0) if (exp_diff == 0)
return make_float (mx, ey, sx); return make_float (mx, ey, sx);
/* Assume modulo/divide operation is slow, so use multiplication with invert /* Multiplication with the inverse is faster than repeated modulo. */
values. */
uint32_t inv_hy = UINT32_MAX / my; uint32_t inv_hy = UINT32_MAX / my;
while (exp_diff > sides_zeroes) { while (exp_diff > sides_zeroes) {
exp_diff -= sides_zeroes; exp_diff -= sides_zeroes;