diff --git a/sysdeps/aarch64/sfp-machine.h b/sysdeps/aarch64/sfp-machine.h
index b41a9462df..3468e47db3 100644
--- a/sysdeps/aarch64/sfp-machine.h
+++ b/sysdeps/aarch64/sfp-machine.h
@@ -2,9 +2,9 @@
 #include <fpu_control.h>
 
 #define _FP_W_TYPE_SIZE		64
-#define _FP_W_TYPE		unsigned long long
-#define _FP_WS_TYPE		signed long long
-#define _FP_I_TYPE		long long
+#define _FP_W_TYPE		unsigned long
+#define _FP_WS_TYPE		signed long
+#define _FP_I_TYPE		long
 
 #define _FP_MUL_MEAT_S(R,X,Y)					\
   _FP_MUL_MEAT_1_imm(_FP_WFRACBITS_S,R,X,Y)
diff --git a/sysdeps/generic/gmp-arch.h b/sysdeps/generic/gmp-arch.h
index b093f59e20..3adab10732 100644
--- a/sysdeps/generic/gmp-arch.h
+++ b/sysdeps/generic/gmp-arch.h
@@ -21,6 +21,7 @@
 
 #include <stdint.h>
 #include <gmp.h>
+#include <math_uint128.h>
 
 #define LL_B ((mp_limb_t) 1 << (BITS_PER_MP_LIMB / 2))
 
@@ -97,4 +98,58 @@ udiv_qrnnd_generic (mp_limb_t *q, mp_limb_t *r, mp_limb_t n1, mp_limb_t n0,
   udiv_qrnnd_generic (&__q, &__r, __n1, __n0, __d)
 #endif
 
+
+/* add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
+   high_addend_2, low_addend_2) adds two two-limb (mp_limb_t) integers,
+   composed of HIGH_ADDEND_1:LOW_ADDEND_1 and HIGH_ADDEND_2:LOW_ADDEND_2
+   respectively.  The result is placed in HIGH_SUM and LOW_SUM.  Overflow
+   (i.e. carry out) is not stored anywhere, and is lost.  */
+static __always_inline void
+add_ssaaaa_generic (mp_limb_t *sh, mp_limb_t *sl, mp_limb_t ah,
+		    mp_limb_t al,  mp_limb_t bh,  mp_limb_t bl)
+{
+#if __WORDSIZE == 32
+  /* Join each pair into one 64-bit value; the sum wraps modulo 2^64.  */
+  uint64_t sum = (((uint64_t) ah << 32) | al) + (((uint64_t) bh << 32) | bl);
+  *sh = sum >> 32;
+  *sl = sum & 0xFFFFFFFF;
+#else
+  u128 lhs = u128_from_hl (ah, al);
+  u128 rhs = u128_from_hl (bh, bl);
+  u128 sum = u128_add (lhs, rhs);
+  *sh = u128_high (sum);
+  *sl = u128_low (sum);
+#endif
+}
+#undef add_ssaaaa
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+  add_ssaaaa_generic (&sh, &sl, ah, al, bh, bl)
+
+/* sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
+   high_subtrahend, low_subtrahend) subtracts two two-limb (mp_limb_t)
+   integers, composed of HIGH_MINUEND and LOW_MINUEND, and HIGH_SUBTRAHEND
+   and LOW_SUBTRAHEND respectively.  The result is placed in HIGH_DIFFERENCE
+   and LOW_DIFFERENCE.  Overflow (i.e. borrow out) is not stored anywhere,
+   and is lost.  */
+static __always_inline void
+sub_ddmmss_generic (mp_limb_t *sh, mp_limb_t *sl, mp_limb_t ah,
+		    mp_limb_t al,  mp_limb_t bh,  mp_limb_t bl)
+{
+#if __WORDSIZE == 32
+  /* Join each pair into one 64-bit value; the difference wraps.  */
+  uint64_t diff = (((uint64_t) ah << 32) | al) - (((uint64_t) bh << 32) | bl);
+  *sh = diff >> 32;
+  *sl = diff & 0xFFFFFFFF;
+#else
+  u128 minuend = u128_from_hl (ah, al);
+  u128 subtrahend = u128_from_hl (bh, bl);
+  u128 diff = u128_sub (minuend, subtrahend);
+  *sh = u128_high (diff);
+  *sl = u128_low (diff);
+#endif
+}
+#undef sub_ddmmss
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+  sub_ddmmss_generic (&sh, &sl, ah, al, bh, bl)
+
 #endif /* __GMP_ARCH_H */
diff --git a/sysdeps/generic/math_uint128.h b/sysdeps/generic/math_uint128.h
index 56d0fba780..fa5aeb8576 100644
--- a/sysdeps/generic/math_uint128.h
+++ b/sysdeps/generic/math_uint128.h
@@ -19,6 +19,8 @@
 #ifndef _MATH_INT128_H
 #define _MATH_INT128_H
 
+#include <stdbool.h>
+
 /* Limited support for internal 128 bit integer, used on some math
    implementations.  It uses compiler builtin type if supported, otherwise
    it is emulated.  Only unsigned and some operations are currently supported:
@@ -27,8 +29,11 @@
    - u128_high:      return the high part of the number.
    - u128_low:       return the low part of the number.
    - u128_from_u64:  create a 128 bit number from a 64 bit one.
+   - u128_from_hl:   create a 128 bit number from two 64 bit numbers.
    - u128_mul:       multiply two 128 bit numbers.
    - u128_add:       add two 128 bit numbers.
+   - u128_sub:       subtract two 128 bit numbers.
+   - u128_neg:       negate a 128 bit number.
    - u128_lshift:    left shift a number.
    - u128_rshift:    right shift a number.
  */
@@ -47,8 +52,11 @@ typedef unsigned __int128 u128;
 # define u128_high(__x)         (uint64_t)((__x) >> 64)
 # define u128_low(__x)          (uint64_t)(__x)
 # define u128_from_u64(__x)     (u128)(__x)
+# define u128_from_hl(__h, __l) (((u128)(__h) << 64) | (__l))
 # define u128_mul(__x, __y)     (__x) * (__y)
 # define u128_add(__x, __y)     (__x) + (__y)
+# define u128_sub(__x, __y)     ((__x) - (__y))
+# define u128_neg(__x)          (-(u128)(__x))
 # define u128_lshift(__x, __y)  (__x) << (__y)
 # define u128_rshift(__x, __y)  (__x) >> (__y)
 #else
@@ -61,16 +68,28 @@ typedef struct
 # define u128_high(__x)         (__x).high
 # define u128_low(__x)          (__x).low
 # define u128_from_u64(__x)     (u128){.low = (__x), .high = 0}
+# define u128_from_hl(__h, __l) (u128){.low = (__l), .high = (__h)}
 
 # define MASK32                 (UINT64_C(0xffffffff))
 
-static u128 u128_add (u128 x, u128 y)
+static inline u128 u128_add (u128 x, u128 y)
 {
   bool carry = x.low + y.low < x.low;
   return (u128) { .high = x.high + y.high + carry, .low = x.low + y.low };
 }
 
-static u128 u128_lshift (u128 x, unsigned int n)
+static inline u128 u128_neg (u128 x)
+{
+  /* Two's complement negation: invert every bit, then add one.  */
+  return u128_add (u128_from_hl (~x.high, ~x.low), u128_from_u64 (1));
+}
+
+static inline u128 u128_sub (u128 x, u128 y)
+{
+  return u128_add (x, u128_neg (y));  /* x - y computed as x + (-y).  */
+}
+
+static inline u128 u128_lshift (u128 x, unsigned int n)
 {
   switch (n)
     {
@@ -82,7 +101,7 @@ static u128 u128_lshift (u128 x, unsigned int n)
     }
 }
 
-static u128 u128_rshift (u128 x, unsigned int n)
+static inline u128 u128_rshift (u128 x, unsigned int n)
 {
   switch (n)
     {
@@ -94,7 +113,7 @@ static u128 u128_rshift (u128 x, unsigned int n)
     }
 }
 
-static u128 u128_mul (u128 x, u128 y)
+static inline u128 u128_mul (u128 x, u128 y)
 {
   if (x.high == 0 && y.high == 0)
     {
diff --git a/sysdeps/loongarch/sfp-machine.h b/sysdeps/loongarch/sfp-machine.h
index 497b550f5c..113d96651b 100644
--- a/sysdeps/loongarch/sfp-machine.h
+++ b/sysdeps/loongarch/sfp-machine.h
@@ -21,9 +21,9 @@
 #include <fpu_control.h>
 
 #define _FP_W_TYPE_SIZE 64
-#define _FP_W_TYPE unsigned long long
-#define _FP_WS_TYPE signed long long
-#define _FP_I_TYPE long long
+#define _FP_W_TYPE unsigned long
+#define _FP_WS_TYPE signed long
+#define _FP_I_TYPE long
 
 #define _FP_MUL_MEAT_S(R, X, Y) _FP_MUL_MEAT_1_imm (_FP_WFRACBITS_S, R, X, Y)
 #define _FP_MUL_MEAT_D(R, X, Y) \
diff --git a/sysdeps/powerpc/powerpc64/le/fpu/sfp-machine.h b/sysdeps/powerpc/powerpc64/le/fpu/sfp-machine.h
index b4b27f95f4..30b7fcf6ed 100644
--- a/sysdeps/powerpc/powerpc64/le/fpu/sfp-machine.h
+++ b/sysdeps/powerpc/powerpc64/le/fpu/sfp-machine.h
@@ -1,7 +1,7 @@
 #define _FP_W_TYPE_SIZE		64
-#define _FP_W_TYPE		unsigned long long
-#define _FP_WS_TYPE		signed long long
-#define _FP_I_TYPE		long long
+#define _FP_W_TYPE		unsigned long
+#define _FP_WS_TYPE		signed long
+#define _FP_I_TYPE		long
 
 typedef int TItype __attribute__ ((mode (TI)));
 typedef unsigned int UTItype __attribute__ ((mode (TI)));
diff --git a/sysdeps/riscv/sfp-machine.h b/sysdeps/riscv/sfp-machine.h
index 43b2ba10ab..a38edd1e3c 100644
--- a/sysdeps/riscv/sfp-machine.h
+++ b/sysdeps/riscv/sfp-machine.h
@@ -52,9 +52,9 @@
 #else
 
 # define _FP_W_TYPE_SIZE		64
-# define _FP_W_TYPE		unsigned long long
-# define _FP_WS_TYPE		signed long long
-# define _FP_I_TYPE		long long
+# define _FP_W_TYPE		unsigned long
+# define _FP_WS_TYPE		signed long
+# define _FP_I_TYPE		long
 
 # define _FP_MUL_MEAT_S(R, X, Y)					\
   _FP_MUL_MEAT_1_imm (_FP_WFRACBITS_S, R, X, Y)
diff --git a/sysdeps/x86/fpu/sfp-machine.h b/sysdeps/x86/fpu/sfp-machine.h
index e30cbdb20b..aae0653c83 100644
--- a/sysdeps/x86/fpu/sfp-machine.h
+++ b/sysdeps/x86/fpu/sfp-machine.h
@@ -18,9 +18,15 @@ typedef long int __gcc_CMPtype;
 
 #ifdef __x86_64__
 # define _FP_W_TYPE_SIZE	64
-# define _FP_W_TYPE		unsigned long long
-# define _FP_WS_TYPE		signed long long
-# define _FP_I_TYPE		long long
+# ifndef __ILP32__
+#  define _FP_W_TYPE		unsigned long
+#  define _FP_WS_TYPE		signed long
+#  define _FP_I_TYPE		long
+# else /* __ILP32__ */
+#  define _FP_W_TYPE		unsigned long long
+#  define _FP_WS_TYPE		signed long long
+#  define _FP_I_TYPE		long long
+# endif /* __ILP32__ */
 
 typedef int TItype __attribute__ ((mode (TI)));
 typedef unsigned int UTItype __attribute__ ((mode (TI)));
@@ -55,9 +61,9 @@ typedef unsigned int UTItype __attribute__ ((mode (TI)));
   } while (0)
 #else
 # define _FP_W_TYPE_SIZE	32
-# define _FP_W_TYPE		unsigned int
-# define _FP_WS_TYPE		signed int
-# define _FP_I_TYPE		int
+# define _FP_W_TYPE		unsigned long int
+# define _FP_WS_TYPE		signed long int
+# define _FP_I_TYPE		long int
 
 # define __FP_FRAC_ADD_4(r3,r2,r1,r0,x3,x2,x1,x0,y3,y2,y1,y0)	\
   __asm__ ("add{l} {%11,%3|%3,%11}\n\t"				\