Merge commits '44916ae9 86e3b38a ddf2b291 6138d73b e40fd277 ' into temp-merge-1156

Jonas Nick 2023-07-17 14:01:52 +00:00
commit e996d076da
No known key found for this signature in database
GPG Key ID: 4861DBF262123605
21 changed files with 1225 additions and 342 deletions


@@ -74,6 +74,7 @@ task:
- env: {WIDEMUL: int64, RECOVERY: yes}
- env: {WIDEMUL: int64, ECDH: yes, SCHNORRSIG: yes, EXPERIMENTAL: yes, ECDSA_S2C: yes, RANGEPROOF: yes, WHITELIST: yes, GENERATOR: yes, MUSIG: yes, ECDSAADAPTOR: yes, BPPP: yes}
- env: {WIDEMUL: int128}
- env: {WIDEMUL: int128_struct}
- env: {WIDEMUL: int128, RECOVERY: yes, SCHNORRSIG: yes}
- env: {WIDEMUL: int128, ECDH: yes, SCHNORRSIG: yes, EXPERIMENTAL: yes, ECDSA_S2C: yes, RANGEPROOF: yes, WHITELIST: yes, GENERATOR: yes, MUSIG: yes, ECDSAADAPTOR: yes, BPPP: yes}
- env: {WIDEMUL: int128, ASM: x86_64}
@@ -268,20 +269,26 @@ task:
ECDSAADAPTOR: yes
BPPP: yes
CTIMETEST: no
# Use a MinGW-w64 host to tell ./configure we're building for Windows.
# This will detect some MinGW-w64 tools but then make will need only
# the MSVC tools CC, AR and NM as specified below.
HOST: x86_64-w64-mingw32
CC: /opt/msvc/bin/x64/cl
AR: /opt/msvc/bin/x64/lib
NM: /opt/msvc/bin/x64/dumpbin -symbols -headers
# Set non-essential options that affect the CLI messages here.
# (They depend on the user's taste, so we don't want to set them automatically in configure.ac.)
CFLAGS: -nologo -diagnostics:caret
LDFLAGS: -XCClinker -nologo -XCClinker -diagnostics:caret
# Use a MinGW-w64 host to tell ./configure we're building for Windows.
# This will detect some MinGW-w64 tools but then make will need only
# the MSVC tools CC, AR and NM as specified below.
matrix:
- name: "x86_64 (MSVC): Windows (Debian stable, Wine)"
- name: "x86_64 (MSVC): Windows (Debian stable, Wine, int128_struct)"
env:
HOST: x86_64-w64-mingw32
CC: /opt/msvc/bin/x64/cl
AR: /opt/msvc/bin/x64/lib
NM: /opt/msvc/bin/x64/dumpbin -symbols -headers
WIDEMUL: int128_struct
- name: "x86_64 (MSVC): Windows (Debian stable, Wine, int128_struct with __(u)mulh)"
env:
WIDEMUL: int128_struct
CPPFLAGS: -DSECP256K1_MSVC_MULH_TEST_OVERRIDE
- name: "i686 (MSVC): Windows (Debian stable, Wine)"
env:
HOST: i686-w64-mingw32
@@ -346,6 +353,40 @@ task:
- ./ci/cirrus.sh
<< : *CAT_LOGS
# Memory sanitizers
task:
<< : *LINUX_CONTAINER
name: "MSan"
env:
ECDH: yes
RECOVERY: yes
SCHNORRSIG: yes
EXPERIMENTAL: yes
ECDSA_S2C: yes
GENERATOR: yes
RANGEPROOF: yes
WHITELIST: yes
MUSIG: yes
ECDSAADAPTOR: yes
BPPP: yes
CTIMETEST: no
CC: clang
SECP256K1_TEST_ITERS: 32
ASM: no
container:
memory: 2G
matrix:
- env:
CFLAGS: "-fsanitize=memory -g"
- env:
ECMULTGENPRECISION: 2
ECMULTWINDOW: 2
CFLAGS: "-fsanitize=memory -g -O3"
<< : *MERGE_BASE
test_script:
- ./ci/cirrus.sh
<< : *CAT_LOGS
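The env block above doubles as a recipe for reproducing the MSan job outside CI; the print_environment function in ci/cirrus.sh (shown further down) exists for exactly that purpose. A sketch, assuming a clang toolchain with MemorySanitizer support and the repository root as the working directory:

ECDH=yes RECOVERY=yes SCHNORRSIG=yes EXPERIMENTAL=yes ECDSA_S2C=yes GENERATOR=yes \
RANGEPROOF=yes WHITELIST=yes MUSIG=yes ECDSAADAPTOR=yes BPPP=yes CTIMETEST=no \
CC=clang SECP256K1_TEST_ITERS=32 ASM=no CFLAGS="-fsanitize=memory -g" ./ci/cirrus.sh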
task:
name: "C++ -fpermissive (entire project)"
<< : *LINUX_CONTAINER


@@ -50,6 +50,12 @@ noinst_HEADERS += src/precomputed_ecmult.h
noinst_HEADERS += src/precomputed_ecmult_gen.h
noinst_HEADERS += src/assumptions.h
noinst_HEADERS += src/util.h
noinst_HEADERS += src/int128.h
noinst_HEADERS += src/int128_impl.h
noinst_HEADERS += src/int128_native.h
noinst_HEADERS += src/int128_native_impl.h
noinst_HEADERS += src/int128_struct.h
noinst_HEADERS += src/int128_struct_impl.h
noinst_HEADERS += src/scratch.h
noinst_HEADERS += src/scratch_impl.h
noinst_HEADERS += src/selftest.h


@@ -5,6 +5,27 @@ set -x
export LC_ALL=C
# Print relevant CI environment to allow reproducing the job outside of CI.
print_environment() {
# Turn off -x because it messes up the output
set +x
# There are many ways to print variable names and their content. This one
# does not rely on bash.
for i in WERROR_CFLAGS MAKEFLAGS BUILD \
ECMULTWINDOW ECMULTGENPRECISION ASM WIDEMUL WITH_VALGRIND EXTRAFLAGS \
EXPERIMENTAL ECDH RECOVERY SCHNORRSIG \
ECDSA_S2C GENERATOR RANGEPROOF WHITELIST MUSIG ECDSAADAPTOR BPPP \
SECP256K1_TEST_ITERS BENCH SECP256K1_BENCH_ITERS CTIMETEST\
EXAMPLES \
WRAPPER_CMD CC AR NM HOST
do
eval 'printf "%s %s " "$i=\"${'"$i"'}\""'
done
echo "$0"
set -x
}
print_environment
# Start persistent wineserver if necessary.
# This speeds up jobs with many invocations of wine (e.g., ./configure with MSVC) tremendously.
case "$WRAPPER_CMD" in


@@ -220,7 +220,11 @@ AC_ARG_ENABLE(reduced_surjection_proof_size,
[SECP_SET_DEFAULT([use_reduced_surjection_proof_size], [no], [no])])
# Test-only override of the (autodetected by the C code) "widemul" setting.
# Legal values are int64 (for [u]int64_t), int128 (for [unsigned] __int128), and auto (the default).
# Legal values are:
# * int64 (for [u]int64_t),
# * int128 (for [unsigned] __int128),
# * int128_struct (for int128 implemented as a structure),
# * and auto (the default).
AC_ARG_WITH([test-override-wide-multiply], [] ,[set_widemul=$withval], [set_widemul=auto])
AC_ARG_WITH([asm], [AS_HELP_STRING([--with-asm=x86_64|arm|no|auto],
@@ -342,6 +346,9 @@ fi
# Select wide multiplication implementation
case $set_widemul in
int128_struct)
AC_DEFINE(USE_FORCE_WIDEMUL_INT128_STRUCT, 1, [Define this symbol to force the use of the structure for simulating (unsigned) int128 based wide multiplication])
;;
int128)
AC_DEFINE(USE_FORCE_WIDEMUL_INT128, 1, [Define this symbol to force the use of the (unsigned) __int128 based wide multiplication implementation])
;;
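For reference, the CI scripts feed the WIDEMUL variable into this option; invoked by hand, the new backend would be selected with the standard autotools flag derived from the AC_ARG_WITH name above (a usage sketch, not CI-specific):

./configure --with-test-override-wide-multiply=int128_struct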


@@ -10,6 +10,9 @@
#include <limits.h>
#include "util.h"
#if defined(SECP256K1_INT128_NATIVE)
#include "int128_native.h"
#endif
/* This library, like most software, relies on a number of compiler implementation defined (but not undefined)
behaviours. Although the behaviours we require are essentially universal we test them specifically here to
@@ -55,7 +58,7 @@ struct secp256k1_assumption_checker {
/* To int64_t. */
((int64_t)(uint64_t)0xB123C456D789E012ULL == (int64_t)-(int64_t)0x4EDC3BA928761FEEULL) &&
#if defined(SECP256K1_WIDEMUL_INT128)
#if defined(SECP256K1_INT128_NATIVE)
((int64_t)(((uint128_t)0xA1234567B8901234ULL << 64) + 0xC5678901D2345678ULL) == (int64_t)-(int64_t)0x3A9876FE2DCBA988ULL) &&
(((int64_t)(int128_t)(((uint128_t)0xB1C2D3E4F5A6B7C8ULL << 64) + 0xD9E0F1A2B3C4D5E6ULL)) == (int64_t)(uint64_t)0xD9E0F1A2B3C4D5E6ULL) &&
(((int64_t)(int128_t)(((uint128_t)0xABCDEF0123456789ULL << 64) + 0x0123456789ABCDEFULL)) == (int64_t)(uint64_t)0x0123456789ABCDEFULL) &&
@@ -71,7 +74,7 @@ struct secp256k1_assumption_checker {
((((int16_t)0xE9AC) >> 4) == (int16_t)(uint16_t)0xFE9A) &&
((((int32_t)0x937C918A) >> 9) == (int32_t)(uint32_t)0xFFC9BE48) &&
((((int64_t)0xA8B72231DF9CF4B9ULL) >> 19) == (int64_t)(uint64_t)0xFFFFF516E4463BF3ULL) &&
#if defined(SECP256K1_WIDEMUL_INT128)
#if defined(SECP256K1_INT128_NATIVE)
((((int128_t)(((uint128_t)0xCD833A65684A0DBCULL << 64) + 0xB349312F71EA7637ULL)) >> 39) == (int128_t)(((uint128_t)0xFFFFFFFFFF9B0674ULL << 64) + 0xCAD0941B79669262ULL)) &&
#endif
1) * 2 - 1];


@@ -11,6 +11,7 @@
#include "util.h"
#include "bench.h"
#include "hash_impl.h"
#include "int128_impl.h"
#include "scalar_impl.h"
#include "testrand_impl.h"


@@ -200,9 +200,15 @@ static int secp256k1_ecmult_wnaf(int *wnaf, int len, const secp256k1_scalar *a,
bit += now;
}
#ifdef VERIFY
CHECK(carry == 0);
while (bit < 256) {
CHECK(secp256k1_scalar_get_bits(&s, bit++, 1) == 0);
{
int verify_bit = bit;
VERIFY_CHECK(carry == 0);
while (verify_bit < 256) {
VERIFY_CHECK(secp256k1_scalar_get_bits(&s, verify_bit, 1) == 0);
verify_bit++;
}
}
#endif
return last_set_bit + 1;


@@ -9,14 +9,18 @@
#include <stdint.h>
#include "int128.h"
#ifdef VERIFY
#define VERIFY_BITS(x, n) VERIFY_CHECK(((x) >> (n)) == 0)
#define VERIFY_BITS_128(x, n) VERIFY_CHECK(secp256k1_u128_check_bits((x), (n)))
#else
#define VERIFY_BITS(x, n) do { } while(0)
#define VERIFY_BITS_128(x, n) do { } while(0)
#endif
SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b) {
uint128_t c, d;
secp256k1_uint128 c, d;
uint64_t t3, t4, tx, u0;
uint64_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4];
const uint64_t M = 0xFFFFFFFFFFFFFULL, R = 0x1000003D10ULL;
@@ -40,121 +44,119 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t
* Note that [x 0 0 0 0 0] = [x*R].
*/
d = (uint128_t)a0 * b[3]
+ (uint128_t)a1 * b[2]
+ (uint128_t)a2 * b[1]
+ (uint128_t)a3 * b[0];
VERIFY_BITS(d, 114);
secp256k1_u128_mul(&d, a0, b[3]);
secp256k1_u128_accum_mul(&d, a1, b[2]);
secp256k1_u128_accum_mul(&d, a2, b[1]);
secp256k1_u128_accum_mul(&d, a3, b[0]);
VERIFY_BITS_128(&d, 114);
/* [d 0 0 0] = [p3 0 0 0] */
c = (uint128_t)a4 * b[4];
VERIFY_BITS(c, 112);
secp256k1_u128_mul(&c, a4, b[4]);
VERIFY_BITS_128(&c, 112);
/* [c 0 0 0 0 d 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */
d += (uint128_t)R * (uint64_t)c; c >>= 64;
VERIFY_BITS(d, 115);
VERIFY_BITS(c, 48);
secp256k1_u128_accum_mul(&d, R, secp256k1_u128_to_u64(&c)); secp256k1_u128_rshift(&c, 64);
VERIFY_BITS_128(&d, 115);
VERIFY_BITS_128(&c, 48);
/* [(c<<12) 0 0 0 0 0 d 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */
t3 = d & M; d >>= 52;
t3 = secp256k1_u128_to_u64(&d) & M; secp256k1_u128_rshift(&d, 52);
VERIFY_BITS(t3, 52);
VERIFY_BITS(d, 63);
VERIFY_BITS_128(&d, 63);
/* [(c<<12) 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */
d += (uint128_t)a0 * b[4]
+ (uint128_t)a1 * b[3]
+ (uint128_t)a2 * b[2]
+ (uint128_t)a3 * b[1]
+ (uint128_t)a4 * b[0];
VERIFY_BITS(d, 115);
secp256k1_u128_accum_mul(&d, a0, b[4]);
secp256k1_u128_accum_mul(&d, a1, b[3]);
secp256k1_u128_accum_mul(&d, a2, b[2]);
secp256k1_u128_accum_mul(&d, a3, b[1]);
secp256k1_u128_accum_mul(&d, a4, b[0]);
VERIFY_BITS_128(&d, 115);
/* [(c<<12) 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */
d += (uint128_t)(R << 12) * (uint64_t)c;
VERIFY_BITS(d, 116);
secp256k1_u128_accum_mul(&d, R << 12, secp256k1_u128_to_u64(&c));
VERIFY_BITS_128(&d, 116);
/* [d t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */
t4 = d & M; d >>= 52;
t4 = secp256k1_u128_to_u64(&d) & M; secp256k1_u128_rshift(&d, 52);
VERIFY_BITS(t4, 52);
VERIFY_BITS(d, 64);
VERIFY_BITS_128(&d, 64);
/* [d t4 t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */
tx = (t4 >> 48); t4 &= (M >> 4);
VERIFY_BITS(tx, 4);
VERIFY_BITS(t4, 48);
/* [d t4+(tx<<48) t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */
c = (uint128_t)a0 * b[0];
VERIFY_BITS(c, 112);
secp256k1_u128_mul(&c, a0, b[0]);
VERIFY_BITS_128(&c, 112);
/* [d t4+(tx<<48) t3 0 0 c] = [p8 0 0 0 p4 p3 0 0 p0] */
d += (uint128_t)a1 * b[4]
+ (uint128_t)a2 * b[3]
+ (uint128_t)a3 * b[2]
+ (uint128_t)a4 * b[1];
VERIFY_BITS(d, 115);
secp256k1_u128_accum_mul(&d, a1, b[4]);
secp256k1_u128_accum_mul(&d, a2, b[3]);
secp256k1_u128_accum_mul(&d, a3, b[2]);
secp256k1_u128_accum_mul(&d, a4, b[1]);
VERIFY_BITS_128(&d, 115);
/* [d t4+(tx<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */
u0 = d & M; d >>= 52;
u0 = secp256k1_u128_to_u64(&d) & M; secp256k1_u128_rshift(&d, 52);
VERIFY_BITS(u0, 52);
VERIFY_BITS(d, 63);
VERIFY_BITS_128(&d, 63);
/* [d u0 t4+(tx<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */
/* [d 0 t4+(tx<<48)+(u0<<52) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */
u0 = (u0 << 4) | tx;
VERIFY_BITS(u0, 56);
/* [d 0 t4+(u0<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */
c += (uint128_t)u0 * (R >> 4);
VERIFY_BITS(c, 115);
secp256k1_u128_accum_mul(&c, u0, R >> 4);
VERIFY_BITS_128(&c, 115);
/* [d 0 t4 t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */
r[0] = c & M; c >>= 52;
r[0] = secp256k1_u128_to_u64(&c) & M; secp256k1_u128_rshift(&c, 52);
VERIFY_BITS(r[0], 52);
VERIFY_BITS(c, 61);
VERIFY_BITS_128(&c, 61);
/* [d 0 t4 t3 0 c r0] = [p8 0 0 p5 p4 p3 0 0 p0] */
c += (uint128_t)a0 * b[1]
+ (uint128_t)a1 * b[0];
VERIFY_BITS(c, 114);
secp256k1_u128_accum_mul(&c, a0, b[1]);
secp256k1_u128_accum_mul(&c, a1, b[0]);
VERIFY_BITS_128(&c, 114);
/* [d 0 t4 t3 0 c r0] = [p8 0 0 p5 p4 p3 0 p1 p0] */
d += (uint128_t)a2 * b[4]
+ (uint128_t)a3 * b[3]
+ (uint128_t)a4 * b[2];
VERIFY_BITS(d, 114);
secp256k1_u128_accum_mul(&d, a2, b[4]);
secp256k1_u128_accum_mul(&d, a3, b[3]);
secp256k1_u128_accum_mul(&d, a4, b[2]);
VERIFY_BITS_128(&d, 114);
/* [d 0 t4 t3 0 c r0] = [p8 0 p6 p5 p4 p3 0 p1 p0] */
c += (d & M) * R; d >>= 52;
VERIFY_BITS(c, 115);
VERIFY_BITS(d, 62);
secp256k1_u128_accum_mul(&c, secp256k1_u128_to_u64(&d) & M, R); secp256k1_u128_rshift(&d, 52);
VERIFY_BITS_128(&c, 115);
VERIFY_BITS_128(&d, 62);
/* [d 0 0 t4 t3 0 c r0] = [p8 0 p6 p5 p4 p3 0 p1 p0] */
r[1] = c & M; c >>= 52;
r[1] = secp256k1_u128_to_u64(&c) & M; secp256k1_u128_rshift(&c, 52);
VERIFY_BITS(r[1], 52);
VERIFY_BITS(c, 63);
VERIFY_BITS_128(&c, 63);
/* [d 0 0 t4 t3 c r1 r0] = [p8 0 p6 p5 p4 p3 0 p1 p0] */
c += (uint128_t)a0 * b[2]
+ (uint128_t)a1 * b[1]
+ (uint128_t)a2 * b[0];
VERIFY_BITS(c, 114);
secp256k1_u128_accum_mul(&c, a0, b[2]);
secp256k1_u128_accum_mul(&c, a1, b[1]);
secp256k1_u128_accum_mul(&c, a2, b[0]);
VERIFY_BITS_128(&c, 114);
/* [d 0 0 t4 t3 c r1 r0] = [p8 0 p6 p5 p4 p3 p2 p1 p0] */
d += (uint128_t)a3 * b[4]
+ (uint128_t)a4 * b[3];
VERIFY_BITS(d, 114);
secp256k1_u128_accum_mul(&d, a3, b[4]);
secp256k1_u128_accum_mul(&d, a4, b[3]);
VERIFY_BITS_128(&d, 114);
/* [d 0 0 t4 t3 c t1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
c += (uint128_t)R * (uint64_t)d; d >>= 64;
VERIFY_BITS(c, 115);
VERIFY_BITS(d, 50);
secp256k1_u128_accum_mul(&c, R, secp256k1_u128_to_u64(&d)); secp256k1_u128_rshift(&d, 64);
VERIFY_BITS_128(&c, 115);
VERIFY_BITS_128(&d, 50);
/* [(d<<12) 0 0 0 t4 t3 c r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
r[2] = c & M; c >>= 52;
r[2] = secp256k1_u128_to_u64(&c) & M; secp256k1_u128_rshift(&c, 52);
VERIFY_BITS(r[2], 52);
VERIFY_BITS(c, 63);
VERIFY_BITS_128(&c, 63);
/* [(d<<12) 0 0 0 t4 t3+c r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
c += (uint128_t)(R << 12) * (uint64_t)d + t3;
VERIFY_BITS(c, 100);
secp256k1_u128_accum_mul(&c, R << 12, secp256k1_u128_to_u64(&d));
secp256k1_u128_accum_u64(&c, t3);
VERIFY_BITS_128(&c, 100);
/* [t4 c r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
r[3] = c & M; c >>= 52;
r[3] = secp256k1_u128_to_u64(&c) & M; secp256k1_u128_rshift(&c, 52);
VERIFY_BITS(r[3], 52);
VERIFY_BITS(c, 48);
VERIFY_BITS_128(&c, 48);
/* [t4+c r3 r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
c += t4;
VERIFY_BITS(c, 49);
/* [c r3 r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
r[4] = c;
r[4] = secp256k1_u128_to_u64(&c) + t4;
VERIFY_BITS(r[4], 49);
/* [r4 r3 r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
}
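The constant R = 0x1000003D10 and the comment "[x 0 0 0 0 0] = [x*R]" follow directly from the field prime; as a worked restatement of the reduction identity used throughout this function, with p = 2^256 - 2^32 - 977 and limb i carrying weight 2^{52i}:

\[ 2^{256} \equiv 2^{32} + 977 = \texttt{0x1000003D1} \pmod{p}, \qquad 2^{260} = 2^{4} \cdot 2^{256} \equiv \texttt{0x1000003D10} = R \pmod{p}, \]

so any carry x accumulated at limb position 5 (weight 2^260) re-enters at limb 0 as x*R, which is what the secp256k1_u128_accum_mul calls with R implement above.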
SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t *a) {
uint128_t c, d;
secp256k1_uint128 c, d;
uint64_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4];
int64_t t3, t4, tx, u0;
const uint64_t M = 0xFFFFFFFFFFFFFULL, R = 0x1000003D10ULL;
@@ -170,107 +172,105 @@ SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t
* Note that [x 0 0 0 0 0] = [x*R].
*/
d = (uint128_t)(a0*2) * a3
+ (uint128_t)(a1*2) * a2;
VERIFY_BITS(d, 114);
secp256k1_u128_mul(&d, a0*2, a3);
secp256k1_u128_accum_mul(&d, a1*2, a2);
VERIFY_BITS_128(&d, 114);
/* [d 0 0 0] = [p3 0 0 0] */
c = (uint128_t)a4 * a4;
VERIFY_BITS(c, 112);
secp256k1_u128_mul(&c, a4, a4);
VERIFY_BITS_128(&c, 112);
/* [c 0 0 0 0 d 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */
d += (uint128_t)R * (uint64_t)c; c >>= 64;
VERIFY_BITS(d, 115);
VERIFY_BITS(c, 48);
secp256k1_u128_accum_mul(&d, R, secp256k1_u128_to_u64(&c)); secp256k1_u128_rshift(&c, 64);
VERIFY_BITS_128(&d, 115);
VERIFY_BITS_128(&c, 48);
/* [(c<<12) 0 0 0 0 0 d 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */
t3 = d & M; d >>= 52;
t3 = secp256k1_u128_to_u64(&d) & M; secp256k1_u128_rshift(&d, 52);
VERIFY_BITS(t3, 52);
VERIFY_BITS(d, 63);
VERIFY_BITS_128(&d, 63);
/* [(c<<12) 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */
a4 *= 2;
d += (uint128_t)a0 * a4
+ (uint128_t)(a1*2) * a3
+ (uint128_t)a2 * a2;
VERIFY_BITS(d, 115);
secp256k1_u128_accum_mul(&d, a0, a4);
secp256k1_u128_accum_mul(&d, a1*2, a3);
secp256k1_u128_accum_mul(&d, a2, a2);
VERIFY_BITS_128(&d, 115);
/* [(c<<12) 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */
d += (uint128_t)(R << 12) * (uint64_t)c;
VERIFY_BITS(d, 116);
secp256k1_u128_accum_mul(&d, R << 12, secp256k1_u128_to_u64(&c));
VERIFY_BITS_128(&d, 116);
/* [d t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */
t4 = d & M; d >>= 52;
t4 = secp256k1_u128_to_u64(&d) & M; secp256k1_u128_rshift(&d, 52);
VERIFY_BITS(t4, 52);
VERIFY_BITS(d, 64);
VERIFY_BITS_128(&d, 64);
/* [d t4 t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */
tx = (t4 >> 48); t4 &= (M >> 4);
VERIFY_BITS(tx, 4);
VERIFY_BITS(t4, 48);
/* [d t4+(tx<<48) t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */
c = (uint128_t)a0 * a0;
VERIFY_BITS(c, 112);
secp256k1_u128_mul(&c, a0, a0);
VERIFY_BITS_128(&c, 112);
/* [d t4+(tx<<48) t3 0 0 c] = [p8 0 0 0 p4 p3 0 0 p0] */
d += (uint128_t)a1 * a4
+ (uint128_t)(a2*2) * a3;
VERIFY_BITS(d, 114);
secp256k1_u128_accum_mul(&d, a1, a4);
secp256k1_u128_accum_mul(&d, a2*2, a3);
VERIFY_BITS_128(&d, 114);
/* [d t4+(tx<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */
u0 = d & M; d >>= 52;
u0 = secp256k1_u128_to_u64(&d) & M; secp256k1_u128_rshift(&d, 52);
VERIFY_BITS(u0, 52);
VERIFY_BITS(d, 62);
VERIFY_BITS_128(&d, 62);
/* [d u0 t4+(tx<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */
/* [d 0 t4+(tx<<48)+(u0<<52) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */
u0 = (u0 << 4) | tx;
VERIFY_BITS(u0, 56);
/* [d 0 t4+(u0<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */
c += (uint128_t)u0 * (R >> 4);
VERIFY_BITS(c, 113);
secp256k1_u128_accum_mul(&c, u0, R >> 4);
VERIFY_BITS_128(&c, 113);
/* [d 0 t4 t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */
r[0] = c & M; c >>= 52;
r[0] = secp256k1_u128_to_u64(&c) & M; secp256k1_u128_rshift(&c, 52);
VERIFY_BITS(r[0], 52);
VERIFY_BITS(c, 61);
VERIFY_BITS_128(&c, 61);
/* [d 0 t4 t3 0 c r0] = [p8 0 0 p5 p4 p3 0 0 p0] */
a0 *= 2;
c += (uint128_t)a0 * a1;
VERIFY_BITS(c, 114);
secp256k1_u128_accum_mul(&c, a0, a1);
VERIFY_BITS_128(&c, 114);
/* [d 0 t4 t3 0 c r0] = [p8 0 0 p5 p4 p3 0 p1 p0] */
d += (uint128_t)a2 * a4
+ (uint128_t)a3 * a3;
VERIFY_BITS(d, 114);
secp256k1_u128_accum_mul(&d, a2, a4);
secp256k1_u128_accum_mul(&d, a3, a3);
VERIFY_BITS_128(&d, 114);
/* [d 0 t4 t3 0 c r0] = [p8 0 p6 p5 p4 p3 0 p1 p0] */
c += (d & M) * R; d >>= 52;
VERIFY_BITS(c, 115);
VERIFY_BITS(d, 62);
secp256k1_u128_accum_mul(&c, secp256k1_u128_to_u64(&d) & M, R); secp256k1_u128_rshift(&d, 52);
VERIFY_BITS_128(&c, 115);
VERIFY_BITS_128(&d, 62);
/* [d 0 0 t4 t3 0 c r0] = [p8 0 p6 p5 p4 p3 0 p1 p0] */
r[1] = c & M; c >>= 52;
r[1] = secp256k1_u128_to_u64(&c) & M; secp256k1_u128_rshift(&c, 52);
VERIFY_BITS(r[1], 52);
VERIFY_BITS(c, 63);
VERIFY_BITS_128(&c, 63);
/* [d 0 0 t4 t3 c r1 r0] = [p8 0 p6 p5 p4 p3 0 p1 p0] */
c += (uint128_t)a0 * a2
+ (uint128_t)a1 * a1;
VERIFY_BITS(c, 114);
secp256k1_u128_accum_mul(&c, a0, a2);
secp256k1_u128_accum_mul(&c, a1, a1);
VERIFY_BITS_128(&c, 114);
/* [d 0 0 t4 t3 c r1 r0] = [p8 0 p6 p5 p4 p3 p2 p1 p0] */
d += (uint128_t)a3 * a4;
VERIFY_BITS(d, 114);
secp256k1_u128_accum_mul(&d, a3, a4);
VERIFY_BITS_128(&d, 114);
/* [d 0 0 t4 t3 c r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
c += (uint128_t)R * (uint64_t)d; d >>= 64;
VERIFY_BITS(c, 115);
VERIFY_BITS(d, 50);
secp256k1_u128_accum_mul(&c, R, secp256k1_u128_to_u64(&d)); secp256k1_u128_rshift(&d, 64);
VERIFY_BITS_128(&c, 115);
VERIFY_BITS_128(&d, 50);
/* [(d<<12) 0 0 0 t4 t3 c r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
r[2] = c & M; c >>= 52;
r[2] = secp256k1_u128_to_u64(&c) & M; secp256k1_u128_rshift(&c, 52);
VERIFY_BITS(r[2], 52);
VERIFY_BITS(c, 63);
VERIFY_BITS_128(&c, 63);
/* [(d<<12) 0 0 0 t4 t3+c r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
c += (uint128_t)(R << 12) * (uint64_t)d + t3;
VERIFY_BITS(c, 100);
secp256k1_u128_accum_mul(&c, R << 12, secp256k1_u128_to_u64(&d));
secp256k1_u128_accum_u64(&c, t3);
VERIFY_BITS_128(&c, 100);
/* [t4 c r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
r[3] = c & M; c >>= 52;
r[3] = secp256k1_u128_to_u64(&c) & M; secp256k1_u128_rshift(&c, 52);
VERIFY_BITS(r[3], 52);
VERIFY_BITS(c, 48);
VERIFY_BITS_128(&c, 48);
/* [t4+c r3 r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
c += t4;
VERIFY_BITS(c, 49);
/* [c r3 r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
r[4] = c;
r[4] = secp256k1_u128_to_u64(&c) + t4;
VERIFY_BITS(r[4], 49);
/* [r4 r3 r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
}

src/int128.h (new file, 85 lines)

@@ -0,0 +1,85 @@
#ifndef SECP256K1_INT128_H
#define SECP256K1_INT128_H
#include "util.h"
#if defined(SECP256K1_WIDEMUL_INT128)
# if defined(SECP256K1_INT128_NATIVE)
# include "int128_native.h"
# elif defined(SECP256K1_INT128_STRUCT)
# include "int128_struct.h"
# else
# error "Please select int128 implementation"
# endif
/* Construct an unsigned 128-bit value from a high and a low 64-bit value. */
static SECP256K1_INLINE void secp256k1_u128_load(secp256k1_uint128 *r, uint64_t hi, uint64_t lo);
/* Multiply two unsigned 64-bit values a and b and write the result to r. */
static SECP256K1_INLINE void secp256k1_u128_mul(secp256k1_uint128 *r, uint64_t a, uint64_t b);
/* Multiply two unsigned 64-bit values a and b and add the result to r.
* The final result is taken modulo 2^128.
*/
static SECP256K1_INLINE void secp256k1_u128_accum_mul(secp256k1_uint128 *r, uint64_t a, uint64_t b);
/* Add an unsigned 64-bit value a to r.
* The final result is taken modulo 2^128.
*/
static SECP256K1_INLINE void secp256k1_u128_accum_u64(secp256k1_uint128 *r, uint64_t a);
/* Unsigned (logical) right shift.
* Non-constant time in n.
*/
static SECP256K1_INLINE void secp256k1_u128_rshift(secp256k1_uint128 *r, unsigned int n);
/* Return the low 64-bits of a 128-bit value as an unsigned 64-bit value. */
static SECP256K1_INLINE uint64_t secp256k1_u128_to_u64(const secp256k1_uint128 *a);
/* Return the high 64-bits of a 128-bit value as an unsigned 64-bit value. */
static SECP256K1_INLINE uint64_t secp256k1_u128_hi_u64(const secp256k1_uint128 *a);
/* Write an unsigned 64-bit value to r. */
static SECP256K1_INLINE void secp256k1_u128_from_u64(secp256k1_uint128 *r, uint64_t a);
/* Tests if r is strictly less than 2^n.
* n must be strictly less than 128.
*/
static SECP256K1_INLINE int secp256k1_u128_check_bits(const secp256k1_uint128 *r, unsigned int n);
/* Construct a signed 128-bit value from a high and a low 64-bit value. */
static SECP256K1_INLINE void secp256k1_i128_load(secp256k1_int128 *r, int64_t hi, uint64_t lo);
/* Multiply two signed 64-bit values a and b and write the result to r. */
static SECP256K1_INLINE void secp256k1_i128_mul(secp256k1_int128 *r, int64_t a, int64_t b);
/* Multiply two signed 64-bit values a and b and add the result to r.
* Overflow or underflow from the addition is undefined behaviour.
*/
static SECP256K1_INLINE void secp256k1_i128_accum_mul(secp256k1_int128 *r, int64_t a, int64_t b);
/* Compute a*d - b*c from signed 64-bit values and write the result to r. */
static SECP256K1_INLINE void secp256k1_i128_det(secp256k1_int128 *r, int64_t a, int64_t b, int64_t c, int64_t d);
/* Signed (arithmetic) right shift.
* Non-constant time in b.
*/
static SECP256K1_INLINE void secp256k1_i128_rshift(secp256k1_int128 *r, unsigned int b);
/* Return the low 64-bits of a 128-bit value interpreted as a signed 64-bit value. */
static SECP256K1_INLINE int64_t secp256k1_i128_to_i64(const secp256k1_int128 *a);
/* Write a signed 64-bit value to r. */
static SECP256K1_INLINE void secp256k1_i128_from_i64(secp256k1_int128 *r, int64_t a);
/* Compare two 128-bit values for equality. */
static SECP256K1_INLINE int secp256k1_i128_eq_var(const secp256k1_int128 *a, const secp256k1_int128 *b);
/* Tests if r is equal to 2^n.
* n must be strictly less than 127.
*/
static SECP256K1_INLINE int secp256k1_i128_check_pow2(const secp256k1_int128 *r, unsigned int n);
#endif
#endif
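Since both backends implement the same static interface, the arithmetic code can be written once against it. A minimal sketch of the calling pattern (these are internal static functions, not public API; example_limb_step is a hypothetical helper mirroring the limb loop in field_5x52_int128_impl.h above):

/* Accumulate two 64x64->128 products, emit a 52-bit limb, keep the carry. */
static void example_limb_step(uint64_t a0, uint64_t b0,
                              uint64_t a1, uint64_t b1,
                              uint64_t *limb, secp256k1_uint128 *carry) {
    const uint64_t M52 = 0xFFFFFFFFFFFFFULL;     /* 52-bit limb mask */
    secp256k1_u128_mul(carry, a0, b0);           /* carry  = a0*b0 */
    secp256k1_u128_accum_mul(carry, a1, b1);     /* carry += a1*b1 (mod 2^128) */
    *limb = secp256k1_u128_to_u64(carry) & M52;  /* low 52 bits become the limb */
    secp256k1_u128_rshift(carry, 52);            /* shift the remainder down */
}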

src/int128_impl.h (new file, 18 lines)

@@ -0,0 +1,18 @@
#ifndef SECP256K1_INT128_IMPL_H
#define SECP256K1_INT128_IMPL_H
#include "util.h"
#include "int128.h"
#if defined(SECP256K1_WIDEMUL_INT128)
# if defined(SECP256K1_INT128_NATIVE)
# include "int128_native_impl.h"
# elif defined(SECP256K1_INT128_STRUCT)
# include "int128_struct_impl.h"
# else
# error "Please select int128 implementation"
# endif
#endif
#endif

src/int128_native.h (new file, 19 lines)

@@ -0,0 +1,19 @@
#ifndef SECP256K1_INT128_NATIVE_H
#define SECP256K1_INT128_NATIVE_H
#include <stdint.h>
#include "util.h"
#if !defined(UINT128_MAX) && defined(__SIZEOF_INT128__)
SECP256K1_GNUC_EXT typedef unsigned __int128 uint128_t;
SECP256K1_GNUC_EXT typedef __int128 int128_t;
# define UINT128_MAX ((uint128_t)(-1))
# define INT128_MAX ((int128_t)(UINT128_MAX >> 1))
# define INT128_MIN (-INT128_MAX - 1)
/* No (U)INT128_C macros because compilers providing __int128 do not support 128-bit literals. */
#endif
typedef uint128_t secp256k1_uint128;
typedef int128_t secp256k1_int128;
#endif

src/int128_native_impl.h (new file, 87 lines)

@@ -0,0 +1,87 @@
#ifndef SECP256K1_INT128_NATIVE_IMPL_H
#define SECP256K1_INT128_NATIVE_IMPL_H
#include "int128.h"
static SECP256K1_INLINE void secp256k1_u128_load(secp256k1_uint128 *r, uint64_t hi, uint64_t lo) {
*r = (((uint128_t)hi) << 64) + lo;
}
static SECP256K1_INLINE void secp256k1_u128_mul(secp256k1_uint128 *r, uint64_t a, uint64_t b) {
*r = (uint128_t)a * b;
}
static SECP256K1_INLINE void secp256k1_u128_accum_mul(secp256k1_uint128 *r, uint64_t a, uint64_t b) {
*r += (uint128_t)a * b;
}
static SECP256K1_INLINE void secp256k1_u128_accum_u64(secp256k1_uint128 *r, uint64_t a) {
*r += a;
}
static SECP256K1_INLINE void secp256k1_u128_rshift(secp256k1_uint128 *r, unsigned int n) {
VERIFY_CHECK(n < 128);
*r >>= n;
}
static SECP256K1_INLINE uint64_t secp256k1_u128_to_u64(const secp256k1_uint128 *a) {
return (uint64_t)(*a);
}
static SECP256K1_INLINE uint64_t secp256k1_u128_hi_u64(const secp256k1_uint128 *a) {
return (uint64_t)(*a >> 64);
}
static SECP256K1_INLINE void secp256k1_u128_from_u64(secp256k1_uint128 *r, uint64_t a) {
*r = a;
}
static SECP256K1_INLINE int secp256k1_u128_check_bits(const secp256k1_uint128 *r, unsigned int n) {
VERIFY_CHECK(n < 128);
return (*r >> n == 0);
}
static SECP256K1_INLINE void secp256k1_i128_load(secp256k1_int128 *r, int64_t hi, uint64_t lo) {
*r = (((uint128_t)(uint64_t)hi) << 64) + lo;
}
static SECP256K1_INLINE void secp256k1_i128_mul(secp256k1_int128 *r, int64_t a, int64_t b) {
*r = (int128_t)a * b;
}
static SECP256K1_INLINE void secp256k1_i128_accum_mul(secp256k1_int128 *r, int64_t a, int64_t b) {
int128_t ab = (int128_t)a * b;
VERIFY_CHECK(0 <= ab ? *r <= INT128_MAX - ab : INT128_MIN - ab <= *r);
*r += ab;
}
static SECP256K1_INLINE void secp256k1_i128_det(secp256k1_int128 *r, int64_t a, int64_t b, int64_t c, int64_t d) {
int128_t ad = (int128_t)a * d;
int128_t bc = (int128_t)b * c;
VERIFY_CHECK(0 <= bc ? INT128_MIN + bc <= ad : ad <= INT128_MAX + bc);
*r = ad - bc;
}
static SECP256K1_INLINE void secp256k1_i128_rshift(secp256k1_int128 *r, unsigned int n) {
VERIFY_CHECK(n < 128);
*r >>= n;
}
static SECP256K1_INLINE int64_t secp256k1_i128_to_i64(const secp256k1_int128 *a) {
return *a;
}
static SECP256K1_INLINE void secp256k1_i128_from_i64(secp256k1_int128 *r, int64_t a) {
*r = a;
}
static SECP256K1_INLINE int secp256k1_i128_eq_var(const secp256k1_int128 *a, const secp256k1_int128 *b) {
return *a == *b;
}
static SECP256K1_INLINE int secp256k1_i128_check_pow2(const secp256k1_int128 *r, unsigned int n) {
VERIFY_CHECK(n < 127);
return (*r == (int128_t)1 << n);
}
#endif

src/int128_struct.h (new file, 14 lines)

@@ -0,0 +1,14 @@
#ifndef SECP256K1_INT128_STRUCT_H
#define SECP256K1_INT128_STRUCT_H
#include <stdint.h>
#include "util.h"
typedef struct {
uint64_t lo;
uint64_t hi;
} secp256k1_uint128;
typedef secp256k1_uint128 secp256k1_int128;
#endif

src/int128_struct_impl.h (new file, 192 lines)

@@ -0,0 +1,192 @@
#ifndef SECP256K1_INT128_STRUCT_IMPL_H
#define SECP256K1_INT128_STRUCT_IMPL_H
#include "int128.h"
#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_ARM64)) /* MSVC */
# include <intrin.h>
# if defined(_M_ARM64) || defined(SECP256K1_MSVC_MULH_TEST_OVERRIDE)
/* On ARM64 MSVC, use __(u)mulh for the upper half of 64x64 multiplications.
(Define SECP256K1_MSVC_MULH_TEST_OVERRIDE to test this code path on X64,
which supports both __(u)mulh and _umul128.) */
# if defined(SECP256K1_MSVC_MULH_TEST_OVERRIDE)
# pragma message(__FILE__ ": SECP256K1_MSVC_MULH_TEST_OVERRIDE is defined, forcing use of __(u)mulh.")
# endif
static SECP256K1_INLINE uint64_t secp256k1_umul128(uint64_t a, uint64_t b, uint64_t* hi) {
*hi = __umulh(a, b);
return a * b;
}
static SECP256K1_INLINE int64_t secp256k1_mul128(int64_t a, int64_t b, int64_t* hi) {
*hi = __mulh(a, b);
return (uint64_t)a * (uint64_t)b;
}
# else
/* On x86_64 MSVC, use native _(u)mul128 for 64x64->128 multiplications. */
# define secp256k1_umul128 _umul128
# define secp256k1_mul128 _mul128
# endif
#else
/* On other systems, emulate 64x64->128 multiplications using 32x32->64 multiplications. */
static SECP256K1_INLINE uint64_t secp256k1_umul128(uint64_t a, uint64_t b, uint64_t* hi) {
uint64_t ll = (uint64_t)(uint32_t)a * (uint32_t)b;
uint64_t lh = (uint32_t)a * (b >> 32);
uint64_t hl = (a >> 32) * (uint32_t)b;
uint64_t hh = (a >> 32) * (b >> 32);
uint64_t mid34 = (ll >> 32) + (uint32_t)lh + (uint32_t)hl;
*hi = hh + (lh >> 32) + (hl >> 32) + (mid34 >> 32);
return (mid34 << 32) + (uint32_t)ll;
}
static SECP256K1_INLINE int64_t secp256k1_mul128(int64_t a, int64_t b, int64_t* hi) {
uint64_t ll = (uint64_t)(uint32_t)a * (uint32_t)b;
int64_t lh = (uint32_t)a * (b >> 32);
int64_t hl = (a >> 32) * (uint32_t)b;
int64_t hh = (a >> 32) * (b >> 32);
uint64_t mid34 = (ll >> 32) + (uint32_t)lh + (uint32_t)hl;
*hi = hh + (lh >> 32) + (hl >> 32) + (mid34 >> 32);
return (mid34 << 32) + (uint32_t)ll;
}
#endif
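The fallback relies on the schoolbook identity a*b = hh*2^64 + (lh+hl)*2^32 + ll over 32-bit halves, with mid34 collecting the two middle columns plus the carry out of ll (a sum that fits in 34 bits). A hedged self-check sketch, assuming a compiler that also offers native unsigned __int128 (test_umul128 is a hypothetical name, not part of this change):

#if defined(__SIZEOF_INT128__)
static int test_umul128(uint64_t a, uint64_t b) {
    uint64_t hi;
    uint64_t lo = secp256k1_umul128(a, b, &hi);       /* portable path */
    unsigned __int128 ref = (unsigned __int128)a * b; /* native reference */
    return lo == (uint64_t)ref && hi == (uint64_t)(ref >> 64);
}
#endif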
static SECP256K1_INLINE void secp256k1_u128_load(secp256k1_uint128 *r, uint64_t hi, uint64_t lo) {
r->hi = hi;
r->lo = lo;
}
static SECP256K1_INLINE void secp256k1_u128_mul(secp256k1_uint128 *r, uint64_t a, uint64_t b) {
r->lo = secp256k1_umul128(a, b, &r->hi);
}
static SECP256K1_INLINE void secp256k1_u128_accum_mul(secp256k1_uint128 *r, uint64_t a, uint64_t b) {
uint64_t lo, hi;
lo = secp256k1_umul128(a, b, &hi);
r->lo += lo;
r->hi += hi + (r->lo < lo);
}
static SECP256K1_INLINE void secp256k1_u128_accum_u64(secp256k1_uint128 *r, uint64_t a) {
r->lo += a;
r->hi += r->lo < a;
}
/* Unsigned (logical) right shift.
* Non-constant time in n.
*/
static SECP256K1_INLINE void secp256k1_u128_rshift(secp256k1_uint128 *r, unsigned int n) {
VERIFY_CHECK(n < 128);
if (n >= 64) {
r->lo = r->hi >> (n-64);
r->hi = 0;
} else if (n > 0) {
r->lo = ((1U * r->hi) << (64-n)) | r->lo >> n;
r->hi >>= n;
}
}
static SECP256K1_INLINE uint64_t secp256k1_u128_to_u64(const secp256k1_uint128 *a) {
return a->lo;
}
static SECP256K1_INLINE uint64_t secp256k1_u128_hi_u64(const secp256k1_uint128 *a) {
return a->hi;
}
static SECP256K1_INLINE void secp256k1_u128_from_u64(secp256k1_uint128 *r, uint64_t a) {
r->hi = 0;
r->lo = a;
}
static SECP256K1_INLINE int secp256k1_u128_check_bits(const secp256k1_uint128 *r, unsigned int n) {
VERIFY_CHECK(n < 128);
return n >= 64 ? r->hi >> (n - 64) == 0
: r->hi == 0 && r->lo >> n == 0;
}
static SECP256K1_INLINE void secp256k1_i128_load(secp256k1_int128 *r, int64_t hi, uint64_t lo) {
r->hi = hi;
r->lo = lo;
}
static SECP256K1_INLINE void secp256k1_i128_mul(secp256k1_int128 *r, int64_t a, int64_t b) {
int64_t hi;
r->lo = (uint64_t)secp256k1_mul128(a, b, &hi);
r->hi = (uint64_t)hi;
}
static SECP256K1_INLINE void secp256k1_i128_accum_mul(secp256k1_int128 *r, int64_t a, int64_t b) {
int64_t hi;
uint64_t lo = (uint64_t)secp256k1_mul128(a, b, &hi);
r->lo += lo;
hi += r->lo < lo;
/* Verify no overflow.
* If r represents a positive value (the sign bit is not set) and the value we are adding is a positive value (the sign bit is not set),
* then we require that the resulting value also be positive (the sign bit is not set).
* Note that (X <= Y) means (X implies Y) when X and Y are boolean values (i.e. 0 or 1).
*/
VERIFY_CHECK((r->hi <= 0x7fffffffffffffffu && (uint64_t)hi <= 0x7fffffffffffffffu) <= (r->hi + (uint64_t)hi <= 0x7fffffffffffffffu));
/* Verify no underflow.
* If r represents a negative value (the sign bit is set) and the value we are adding is a negative value (the sign bit is set),
* then we require that the resulting value also be negative (the sign bit is set).
*/
VERIFY_CHECK((r->hi > 0x7fffffffffffffffu && (uint64_t)hi > 0x7fffffffffffffffu) <= (r->hi + (uint64_t)hi > 0x7fffffffffffffffu));
r->hi += hi;
}
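Concretely, for boolean X and Y the inequality X <= Y fails only for X=1, Y=0, exactly the one case where X holds but Y does not; each VERIFY_CHECK in accum_mul above and dissip_mul below asserts such an implication about the sign bits.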
static SECP256K1_INLINE void secp256k1_i128_dissip_mul(secp256k1_int128 *r, int64_t a, int64_t b) {
int64_t hi;
uint64_t lo = (uint64_t)secp256k1_mul128(a, b, &hi);
hi += r->lo < lo;
/* Verify no overflow.
* If r represents a positive value (the sign bit is not set) and the value we are subtracting is a negative value (the sign bit is set),
* then we require that the resulting value also be positive (the sign bit is not set).
*/
VERIFY_CHECK((r->hi <= 0x7fffffffffffffffu && (uint64_t)hi > 0x7fffffffffffffffu) <= (r->hi - (uint64_t)hi <= 0x7fffffffffffffffu));
/* Verify no underflow.
* If r represents a negative value (the sign bit is set) and the value we are subtracting is a positive value (the sign bit is not set),
* then we require that the resulting value also be negative (the sign bit is set).
*/
VERIFY_CHECK((r->hi > 0x7fffffffffffffffu && (uint64_t)hi <= 0x7fffffffffffffffu) <= (r->hi - (uint64_t)hi > 0x7fffffffffffffffu));
r->hi -= hi;
r->lo -= lo;
}
static SECP256K1_INLINE void secp256k1_i128_det(secp256k1_int128 *r, int64_t a, int64_t b, int64_t c, int64_t d) {
secp256k1_i128_mul(r, a, d);
secp256k1_i128_dissip_mul(r, b, c);
}
/* Signed (arithmetic) right shift.
* Non-constant time in n.
*/
static SECP256K1_INLINE void secp256k1_i128_rshift(secp256k1_int128 *r, unsigned int n) {
VERIFY_CHECK(n < 128);
if (n >= 64) {
r->lo = (uint64_t)((int64_t)(r->hi) >> (n-64));
r->hi = (uint64_t)((int64_t)(r->hi) >> 63);
} else if (n > 0) {
r->lo = ((1U * r->hi) << (64-n)) | r->lo >> n;
r->hi = (uint64_t)((int64_t)(r->hi) >> n);
}
}
static SECP256K1_INLINE int64_t secp256k1_i128_to_i64(const secp256k1_int128 *a) {
return (int64_t)a->lo;
}
static SECP256K1_INLINE void secp256k1_i128_from_i64(secp256k1_int128 *r, int64_t a) {
r->hi = (uint64_t)(a >> 63);
r->lo = (uint64_t)a;
}
static SECP256K1_INLINE int secp256k1_i128_eq_var(const secp256k1_int128 *a, const secp256k1_int128 *b) {
return a->hi == b->hi && a->lo == b->lo;
}
static SECP256K1_INLINE int secp256k1_i128_check_pow2(const secp256k1_int128 *r, unsigned int n) {
VERIFY_CHECK(n < 127);
return n >= 64 ? r->hi == (uint64_t)1 << (n - 64) && r->lo == 0
: r->hi == 0 && r->lo == (uint64_t)1 << n;
}
#endif


@@ -7,10 +7,9 @@
#ifndef SECP256K1_MODINV64_IMPL_H
#define SECP256K1_MODINV64_IMPL_H
#include "int128.h"
#include "modinv64.h"
#include "util.h"
/* This file implements modular inversion based on the paper "Fast constant-time gcd computation and
* modular inversion" by Daniel J. Bernstein and Bo-Yin Yang.
*
@@ -18,6 +17,15 @@
* implementation for N=62, using 62-bit signed limbs represented as int64_t.
*/
/* Data type for transition matrices (see section 3 of explanation).
*
* t = [ u v ]
* [ q r ]
*/
typedef struct {
int64_t u, v, q, r;
} secp256k1_modinv64_trans2x2;
#ifdef VERIFY
/* Helper function to compute the absolute value of an int64_t.
* (we don't use abs/labs/llabs as it depends on the int sizes). */
@@ -32,15 +40,17 @@ static const secp256k1_modinv64_signed62 SECP256K1_SIGNED62_ONE = {{1}};
/* Compute a*factor and put it in r. All but the top limb in r will be in range [0,2^62). */
static void secp256k1_modinv64_mul_62(secp256k1_modinv64_signed62 *r, const secp256k1_modinv64_signed62 *a, int alen, int64_t factor) {
const int64_t M62 = (int64_t)(UINT64_MAX >> 2);
int128_t c = 0;
secp256k1_int128 c, d;
int i;
secp256k1_i128_from_i64(&c, 0);
for (i = 0; i < 4; ++i) {
if (i < alen) c += (int128_t)a->v[i] * factor;
r->v[i] = (int64_t)c & M62; c >>= 62;
if (i < alen) secp256k1_i128_accum_mul(&c, a->v[i], factor);
r->v[i] = secp256k1_i128_to_i64(&c) & M62; secp256k1_i128_rshift(&c, 62);
}
if (4 < alen) c += (int128_t)a->v[4] * factor;
VERIFY_CHECK(c == (int64_t)c);
r->v[4] = (int64_t)c;
if (4 < alen) secp256k1_i128_accum_mul(&c, a->v[4], factor);
secp256k1_i128_from_i64(&d, secp256k1_i128_to_i64(&c));
VERIFY_CHECK(secp256k1_i128_eq_var(&c, &d));
r->v[4] = secp256k1_i128_to_i64(&c);
}
/* Return -1 for a<b*factor, 0 for a==b*factor, 1 for a>b*factor. A has alen limbs; b has 5. */
@@ -60,6 +70,13 @@ static int secp256k1_modinv64_mul_cmp_62(const secp256k1_modinv64_signed62 *a, i
}
return 0;
}
/* Check if the determinant of t is equal to 1 << n. */
static int secp256k1_modinv64_det_check_pow2(const secp256k1_modinv64_trans2x2 *t, unsigned int n) {
secp256k1_int128 a;
secp256k1_i128_det(&a, t->u, t->v, t->q, t->r);
return secp256k1_i128_check_pow2(&a, n);
}
#endif
/* Take as input a signed62 number in range (-2*modulus,modulus), and add a multiple of the modulus
@@ -136,15 +153,6 @@ static void secp256k1_modinv64_normalize_62(secp256k1_modinv64_signed62 *r, int6
#endif
}
/* Data type for transition matrices (see section 3 of explanation).
*
* t = [ u v ]
* [ q r ]
*/
typedef struct {
int64_t u, v, q, r;
} secp256k1_modinv64_trans2x2;
/* Compute the transition matrix and eta for 59 divsteps (where zeta=-(delta+1/2)).
* Note that the transformation matrix is scaled by 2^62 and not 2^59.
*
@@ -206,13 +214,15 @@ static int64_t secp256k1_modinv64_divsteps_59(int64_t zeta, uint64_t f0, uint64_
t->v = (int64_t)v;
t->q = (int64_t)q;
t->r = (int64_t)r;
#ifdef VERIFY
/* The determinant of t must be a power of two. This guarantees that multiplication with t
* does not change the gcd of f and g, apart from adding a power-of-2 factor to it (which
* will be divided out again). As each divstep's individual matrix has determinant 2, the
* aggregate of 59 of them will have determinant 2^59. Multiplying with the initial
* 8*identity (which has determinant 2^6) means the overall output has determinant
* 2^65. */
VERIFY_CHECK((int128_t)t->u * t->r - (int128_t)t->v * t->q == ((int128_t)1) << 65);
VERIFY_CHECK(secp256k1_modinv64_det_check_pow2(t, 65));
#endif
return zeta;
}
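The check encodes the determinant bookkeeping spelled out in the comment; as a worked equation, with t = [[u, v], [q, r]]:

\[ \det t = u r - v q = \det(8I) \cdot 2^{59} = 2^{6} \cdot 2^{59} = 2^{65}, \]

since each of the 59 divstep matrices contributes a factor of 2 and the initial 8*identity contributes 8^2 = 2^6. The divsteps_62_var variant below starts from the plain identity, hence its check against 2^62.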
@@ -289,11 +299,13 @@ static int64_t secp256k1_modinv64_divsteps_62_var(int64_t eta, uint64_t f0, uint
t->v = (int64_t)v;
t->q = (int64_t)q;
t->r = (int64_t)r;
#ifdef VERIFY
/* The determinant of t must be a power of two. This guarantees that multiplication with t
* does not change the gcd of f and g, apart from adding a power-of-2 factor to it (which
* will be divided out again). As each divstep's individual matrix has determinant 2, the
* aggregate of 62 of them will have determinant 2^62. */
VERIFY_CHECK((int128_t)t->u * t->r - (int128_t)t->v * t->q == ((int128_t)1) << 62);
VERIFY_CHECK(secp256k1_modinv64_det_check_pow2(t, 62));
#endif
return eta;
}
@@ -310,7 +322,7 @@ static void secp256k1_modinv64_update_de_62(secp256k1_modinv64_signed62 *d, secp
const int64_t e0 = e->v[0], e1 = e->v[1], e2 = e->v[2], e3 = e->v[3], e4 = e->v[4];
const int64_t u = t->u, v = t->v, q = t->q, r = t->r;
int64_t md, me, sd, se;
int128_t cd, ce;
secp256k1_int128 cd, ce;
#ifdef VERIFY
VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(d, 5, &modinfo->modulus, -2) > 0); /* d > -2*modulus */
VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(d, 5, &modinfo->modulus, 1) < 0); /* d < modulus */
@@ -327,54 +339,64 @@ static void secp256k1_modinv64_update_de_62(secp256k1_modinv64_signed62 *d, secp
md = (u & sd) + (v & se);
me = (q & sd) + (r & se);
/* Begin computing t*[d,e]. */
cd = (int128_t)u * d0 + (int128_t)v * e0;
ce = (int128_t)q * d0 + (int128_t)r * e0;
secp256k1_i128_mul(&cd, u, d0);
secp256k1_i128_accum_mul(&cd, v, e0);
secp256k1_i128_mul(&ce, q, d0);
secp256k1_i128_accum_mul(&ce, r, e0);
/* Correct md,me so that t*[d,e]+modulus*[md,me] has 62 zero bottom bits. */
md -= (modinfo->modulus_inv62 * (uint64_t)cd + md) & M62;
me -= (modinfo->modulus_inv62 * (uint64_t)ce + me) & M62;
md -= (modinfo->modulus_inv62 * (uint64_t)secp256k1_i128_to_i64(&cd) + md) & M62;
me -= (modinfo->modulus_inv62 * (uint64_t)secp256k1_i128_to_i64(&ce) + me) & M62;
/* Update the beginning of computation for t*[d,e]+modulus*[md,me] now md,me are known. */
cd += (int128_t)modinfo->modulus.v[0] * md;
ce += (int128_t)modinfo->modulus.v[0] * me;
secp256k1_i128_accum_mul(&cd, modinfo->modulus.v[0], md);
secp256k1_i128_accum_mul(&ce, modinfo->modulus.v[0], me);
/* Verify that the low 62 bits of the computation are indeed zero, and then throw them away. */
VERIFY_CHECK(((int64_t)cd & M62) == 0); cd >>= 62;
VERIFY_CHECK(((int64_t)ce & M62) == 0); ce >>= 62;
VERIFY_CHECK((secp256k1_i128_to_i64(&cd) & M62) == 0); secp256k1_i128_rshift(&cd, 62);
VERIFY_CHECK((secp256k1_i128_to_i64(&ce) & M62) == 0); secp256k1_i128_rshift(&ce, 62);
/* Compute limb 1 of t*[d,e]+modulus*[md,me], and store it as output limb 0 (= down shift). */
cd += (int128_t)u * d1 + (int128_t)v * e1;
ce += (int128_t)q * d1 + (int128_t)r * e1;
secp256k1_i128_accum_mul(&cd, u, d1);
secp256k1_i128_accum_mul(&cd, v, e1);
secp256k1_i128_accum_mul(&ce, q, d1);
secp256k1_i128_accum_mul(&ce, r, e1);
if (modinfo->modulus.v[1]) { /* Optimize for the case where limb of modulus is zero. */
cd += (int128_t)modinfo->modulus.v[1] * md;
ce += (int128_t)modinfo->modulus.v[1] * me;
secp256k1_i128_accum_mul(&cd, modinfo->modulus.v[1], md);
secp256k1_i128_accum_mul(&ce, modinfo->modulus.v[1], me);
}
d->v[0] = (int64_t)cd & M62; cd >>= 62;
e->v[0] = (int64_t)ce & M62; ce >>= 62;
d->v[0] = secp256k1_i128_to_i64(&cd) & M62; secp256k1_i128_rshift(&cd, 62);
e->v[0] = secp256k1_i128_to_i64(&ce) & M62; secp256k1_i128_rshift(&ce, 62);
/* Compute limb 2 of t*[d,e]+modulus*[md,me], and store it as output limb 1. */
cd += (int128_t)u * d2 + (int128_t)v * e2;
ce += (int128_t)q * d2 + (int128_t)r * e2;
secp256k1_i128_accum_mul(&cd, u, d2);
secp256k1_i128_accum_mul(&cd, v, e2);
secp256k1_i128_accum_mul(&ce, q, d2);
secp256k1_i128_accum_mul(&ce, r, e2);
if (modinfo->modulus.v[2]) { /* Optimize for the case where limb of modulus is zero. */
cd += (int128_t)modinfo->modulus.v[2] * md;
ce += (int128_t)modinfo->modulus.v[2] * me;
secp256k1_i128_accum_mul(&cd, modinfo->modulus.v[2], md);
secp256k1_i128_accum_mul(&ce, modinfo->modulus.v[2], me);
}
d->v[1] = (int64_t)cd & M62; cd >>= 62;
e->v[1] = (int64_t)ce & M62; ce >>= 62;
d->v[1] = secp256k1_i128_to_i64(&cd) & M62; secp256k1_i128_rshift(&cd, 62);
e->v[1] = secp256k1_i128_to_i64(&ce) & M62; secp256k1_i128_rshift(&ce, 62);
/* Compute limb 3 of t*[d,e]+modulus*[md,me], and store it as output limb 2. */
cd += (int128_t)u * d3 + (int128_t)v * e3;
ce += (int128_t)q * d3 + (int128_t)r * e3;
secp256k1_i128_accum_mul(&cd, u, d3);
secp256k1_i128_accum_mul(&cd, v, e3);
secp256k1_i128_accum_mul(&ce, q, d3);
secp256k1_i128_accum_mul(&ce, r, e3);
if (modinfo->modulus.v[3]) { /* Optimize for the case where limb of modulus is zero. */
cd += (int128_t)modinfo->modulus.v[3] * md;
ce += (int128_t)modinfo->modulus.v[3] * me;
secp256k1_i128_accum_mul(&cd, modinfo->modulus.v[3], md);
secp256k1_i128_accum_mul(&ce, modinfo->modulus.v[3], me);
}
d->v[2] = (int64_t)cd & M62; cd >>= 62;
e->v[2] = (int64_t)ce & M62; ce >>= 62;
d->v[2] = secp256k1_i128_to_i64(&cd) & M62; secp256k1_i128_rshift(&cd, 62);
e->v[2] = secp256k1_i128_to_i64(&ce) & M62; secp256k1_i128_rshift(&ce, 62);
/* Compute limb 4 of t*[d,e]+modulus*[md,me], and store it as output limb 3. */
cd += (int128_t)u * d4 + (int128_t)v * e4;
ce += (int128_t)q * d4 + (int128_t)r * e4;
cd += (int128_t)modinfo->modulus.v[4] * md;
ce += (int128_t)modinfo->modulus.v[4] * me;
d->v[3] = (int64_t)cd & M62; cd >>= 62;
e->v[3] = (int64_t)ce & M62; ce >>= 62;
secp256k1_i128_accum_mul(&cd, u, d4);
secp256k1_i128_accum_mul(&cd, v, e4);
secp256k1_i128_accum_mul(&ce, q, d4);
secp256k1_i128_accum_mul(&ce, r, e4);
secp256k1_i128_accum_mul(&cd, modinfo->modulus.v[4], md);
secp256k1_i128_accum_mul(&ce, modinfo->modulus.v[4], me);
d->v[3] = secp256k1_i128_to_i64(&cd) & M62; secp256k1_i128_rshift(&cd, 62);
e->v[3] = secp256k1_i128_to_i64(&ce) & M62; secp256k1_i128_rshift(&ce, 62);
/* What remains is limb 5 of t*[d,e]+modulus*[md,me]; store it as output limb 4. */
d->v[4] = (int64_t)cd;
e->v[4] = (int64_t)ce;
d->v[4] = secp256k1_i128_to_i64(&cd);
e->v[4] = secp256k1_i128_to_i64(&ce);
#ifdef VERIFY
VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(d, 5, &modinfo->modulus, -2) > 0); /* d > -2*modulus */
VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(d, 5, &modinfo->modulus, 1) < 0); /* d < modulus */
@@ -392,36 +414,46 @@ static void secp256k1_modinv64_update_fg_62(secp256k1_modinv64_signed62 *f, secp
const int64_t f0 = f->v[0], f1 = f->v[1], f2 = f->v[2], f3 = f->v[3], f4 = f->v[4];
const int64_t g0 = g->v[0], g1 = g->v[1], g2 = g->v[2], g3 = g->v[3], g4 = g->v[4];
const int64_t u = t->u, v = t->v, q = t->q, r = t->r;
int128_t cf, cg;
secp256k1_int128 cf, cg;
/* Start computing t*[f,g]. */
cf = (int128_t)u * f0 + (int128_t)v * g0;
cg = (int128_t)q * f0 + (int128_t)r * g0;
secp256k1_i128_mul(&cf, u, f0);
secp256k1_i128_accum_mul(&cf, v, g0);
secp256k1_i128_mul(&cg, q, f0);
secp256k1_i128_accum_mul(&cg, r, g0);
/* Verify that the bottom 62 bits of the result are zero, and then throw them away. */
VERIFY_CHECK(((int64_t)cf & M62) == 0); cf >>= 62;
VERIFY_CHECK(((int64_t)cg & M62) == 0); cg >>= 62;
VERIFY_CHECK((secp256k1_i128_to_i64(&cf) & M62) == 0); secp256k1_i128_rshift(&cf, 62);
VERIFY_CHECK((secp256k1_i128_to_i64(&cg) & M62) == 0); secp256k1_i128_rshift(&cg, 62);
/* Compute limb 1 of t*[f,g], and store it as output limb 0 (= down shift). */
cf += (int128_t)u * f1 + (int128_t)v * g1;
cg += (int128_t)q * f1 + (int128_t)r * g1;
f->v[0] = (int64_t)cf & M62; cf >>= 62;
g->v[0] = (int64_t)cg & M62; cg >>= 62;
secp256k1_i128_accum_mul(&cf, u, f1);
secp256k1_i128_accum_mul(&cf, v, g1);
secp256k1_i128_accum_mul(&cg, q, f1);
secp256k1_i128_accum_mul(&cg, r, g1);
f->v[0] = secp256k1_i128_to_i64(&cf) & M62; secp256k1_i128_rshift(&cf, 62);
g->v[0] = secp256k1_i128_to_i64(&cg) & M62; secp256k1_i128_rshift(&cg, 62);
/* Compute limb 2 of t*[f,g], and store it as output limb 1. */
cf += (int128_t)u * f2 + (int128_t)v * g2;
cg += (int128_t)q * f2 + (int128_t)r * g2;
f->v[1] = (int64_t)cf & M62; cf >>= 62;
g->v[1] = (int64_t)cg & M62; cg >>= 62;
secp256k1_i128_accum_mul(&cf, u, f2);
secp256k1_i128_accum_mul(&cf, v, g2);
secp256k1_i128_accum_mul(&cg, q, f2);
secp256k1_i128_accum_mul(&cg, r, g2);
f->v[1] = secp256k1_i128_to_i64(&cf) & M62; secp256k1_i128_rshift(&cf, 62);
g->v[1] = secp256k1_i128_to_i64(&cg) & M62; secp256k1_i128_rshift(&cg, 62);
/* Compute limb 3 of t*[f,g], and store it as output limb 2. */
cf += (int128_t)u * f3 + (int128_t)v * g3;
cg += (int128_t)q * f3 + (int128_t)r * g3;
f->v[2] = (int64_t)cf & M62; cf >>= 62;
g->v[2] = (int64_t)cg & M62; cg >>= 62;
secp256k1_i128_accum_mul(&cf, u, f3);
secp256k1_i128_accum_mul(&cf, v, g3);
secp256k1_i128_accum_mul(&cg, q, f3);
secp256k1_i128_accum_mul(&cg, r, g3);
f->v[2] = secp256k1_i128_to_i64(&cf) & M62; secp256k1_i128_rshift(&cf, 62);
g->v[2] = secp256k1_i128_to_i64(&cg) & M62; secp256k1_i128_rshift(&cg, 62);
/* Compute limb 4 of t*[f,g], and store it as output limb 3. */
cf += (int128_t)u * f4 + (int128_t)v * g4;
cg += (int128_t)q * f4 + (int128_t)r * g4;
f->v[3] = (int64_t)cf & M62; cf >>= 62;
g->v[3] = (int64_t)cg & M62; cg >>= 62;
secp256k1_i128_accum_mul(&cf, u, f4);
secp256k1_i128_accum_mul(&cf, v, g4);
secp256k1_i128_accum_mul(&cg, q, f4);
secp256k1_i128_accum_mul(&cg, r, g4);
f->v[3] = secp256k1_i128_to_i64(&cf) & M62; secp256k1_i128_rshift(&cf, 62);
g->v[3] = secp256k1_i128_to_i64(&cg) & M62; secp256k1_i128_rshift(&cg, 62);
/* What remains is limb 5 of t*[f,g]; store it as output limb 4. */
f->v[4] = (int64_t)cf;
g->v[4] = (int64_t)cg;
f->v[4] = secp256k1_i128_to_i64(&cf);
g->v[4] = secp256k1_i128_to_i64(&cg);
}
/* Compute (t/2^62) * [f, g], where t is a transition matrix for 62 divsteps.
@@ -434,30 +466,34 @@ static void secp256k1_modinv64_update_fg_62_var(int len, secp256k1_modinv64_sign
const int64_t M62 = (int64_t)(UINT64_MAX >> 2);
const int64_t u = t->u, v = t->v, q = t->q, r = t->r;
int64_t fi, gi;
int128_t cf, cg;
secp256k1_int128 cf, cg;
int i;
VERIFY_CHECK(len > 0);
/* Start computing t*[f,g]. */
fi = f->v[0];
gi = g->v[0];
cf = (int128_t)u * fi + (int128_t)v * gi;
cg = (int128_t)q * fi + (int128_t)r * gi;
secp256k1_i128_mul(&cf, u, fi);
secp256k1_i128_accum_mul(&cf, v, gi);
secp256k1_i128_mul(&cg, q, fi);
secp256k1_i128_accum_mul(&cg, r, gi);
/* Verify that the bottom 62 bits of the result are zero, and then throw them away. */
VERIFY_CHECK(((int64_t)cf & M62) == 0); cf >>= 62;
VERIFY_CHECK(((int64_t)cg & M62) == 0); cg >>= 62;
VERIFY_CHECK((secp256k1_i128_to_i64(&cf) & M62) == 0); secp256k1_i128_rshift(&cf, 62);
VERIFY_CHECK((secp256k1_i128_to_i64(&cg) & M62) == 0); secp256k1_i128_rshift(&cg, 62);
/* Now iteratively compute limb i=1..len of t*[f,g], and store them in output limb i-1 (shifting
* down by 62 bits). */
for (i = 1; i < len; ++i) {
fi = f->v[i];
gi = g->v[i];
cf += (int128_t)u * fi + (int128_t)v * gi;
cg += (int128_t)q * fi + (int128_t)r * gi;
f->v[i - 1] = (int64_t)cf & M62; cf >>= 62;
g->v[i - 1] = (int64_t)cg & M62; cg >>= 62;
secp256k1_i128_accum_mul(&cf, u, fi);
secp256k1_i128_accum_mul(&cf, v, gi);
secp256k1_i128_accum_mul(&cg, q, fi);
secp256k1_i128_accum_mul(&cg, r, gi);
f->v[i - 1] = secp256k1_i128_to_i64(&cf) & M62; secp256k1_i128_rshift(&cf, 62);
g->v[i - 1] = secp256k1_i128_to_i64(&cg) & M62; secp256k1_i128_rshift(&cg, 62);
}
/* What remains is limb (len) of t*[f,g]; store it as output limb (len-1). */
f->v[len - 1] = (int64_t)cf;
g->v[len - 1] = (int64_t)cg;
f->v[len - 1] = secp256k1_i128_to_i64(&cf);
g->v[len - 1] = secp256k1_i128_to_i64(&cg);
}
/* Compute the inverse of x modulo modinfo->modulus, and replace x with it (constant time in x). */


@@ -14,10 +14,13 @@
#endif
#include "../include/secp256k1.h"
#include "assumptions.h"
#include "util.h"
#include "field_impl.h"
#include "group_impl.h"
#include "int128_impl.h"
#include "ecmult.h"
#include "ecmult_compute_table_impl.h"


@@ -8,9 +8,12 @@
#include <stdio.h>
#include "../include/secp256k1.h"
#include "assumptions.h"
#include "util.h"
#include "group.h"
#include "int128_impl.h"
#include "ecmult_gen.h"
#include "ecmult_gen_compute_table_impl.h"


@@ -7,9 +7,8 @@
#ifndef SECP256K1_SCALAR_REPR_IMPL_H
#define SECP256K1_SCALAR_REPR_IMPL_H
#include "scalar.h"
#include <string.h>
#include "int128.h"
#include "modinv64_impl.h"
/* Limbs of the secp256k1 order. */
@@ -79,51 +78,62 @@ SECP256K1_INLINE static int secp256k1_scalar_check_overflow(const secp256k1_scal
}
SECP256K1_INLINE static int secp256k1_scalar_reduce(secp256k1_scalar *r, unsigned int overflow) {
uint128_t t;
secp256k1_uint128 t;
VERIFY_CHECK(overflow <= 1);
t = (uint128_t)r->d[0] + overflow * SECP256K1_N_C_0;
r->d[0] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64;
t += (uint128_t)r->d[1] + overflow * SECP256K1_N_C_1;
r->d[1] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64;
t += (uint128_t)r->d[2] + overflow * SECP256K1_N_C_2;
r->d[2] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64;
t += (uint64_t)r->d[3];
r->d[3] = t & 0xFFFFFFFFFFFFFFFFULL;
secp256k1_u128_from_u64(&t, r->d[0]);
secp256k1_u128_accum_u64(&t, overflow * SECP256K1_N_C_0);
r->d[0] = secp256k1_u128_to_u64(&t); secp256k1_u128_rshift(&t, 64);
secp256k1_u128_accum_u64(&t, r->d[1]);
secp256k1_u128_accum_u64(&t, overflow * SECP256K1_N_C_1);
r->d[1] = secp256k1_u128_to_u64(&t); secp256k1_u128_rshift(&t, 64);
secp256k1_u128_accum_u64(&t, r->d[2]);
secp256k1_u128_accum_u64(&t, overflow * SECP256K1_N_C_2);
r->d[2] = secp256k1_u128_to_u64(&t); secp256k1_u128_rshift(&t, 64);
secp256k1_u128_accum_u64(&t, r->d[3]);
r->d[3] = secp256k1_u128_to_u64(&t);
return overflow;
}
static int secp256k1_scalar_add(secp256k1_scalar *r, const secp256k1_scalar *a, const secp256k1_scalar *b) {
int overflow;
uint128_t t = (uint128_t)a->d[0] + b->d[0];
r->d[0] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64;
t += (uint128_t)a->d[1] + b->d[1];
r->d[1] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64;
t += (uint128_t)a->d[2] + b->d[2];
r->d[2] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64;
t += (uint128_t)a->d[3] + b->d[3];
r->d[3] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64;
overflow = t + secp256k1_scalar_check_overflow(r);
secp256k1_uint128 t;
secp256k1_u128_from_u64(&t, a->d[0]);
secp256k1_u128_accum_u64(&t, b->d[0]);
r->d[0] = secp256k1_u128_to_u64(&t); secp256k1_u128_rshift(&t, 64);
secp256k1_u128_accum_u64(&t, a->d[1]);
secp256k1_u128_accum_u64(&t, b->d[1]);
r->d[1] = secp256k1_u128_to_u64(&t); secp256k1_u128_rshift(&t, 64);
secp256k1_u128_accum_u64(&t, a->d[2]);
secp256k1_u128_accum_u64(&t, b->d[2]);
r->d[2] = secp256k1_u128_to_u64(&t); secp256k1_u128_rshift(&t, 64);
secp256k1_u128_accum_u64(&t, a->d[3]);
secp256k1_u128_accum_u64(&t, b->d[3]);
r->d[3] = secp256k1_u128_to_u64(&t); secp256k1_u128_rshift(&t, 64);
overflow = secp256k1_u128_to_u64(&t) + secp256k1_scalar_check_overflow(r);
VERIFY_CHECK(overflow == 0 || overflow == 1);
secp256k1_scalar_reduce(r, overflow);
return overflow;
}
static void secp256k1_scalar_cadd_bit(secp256k1_scalar *r, unsigned int bit, int flag) {
uint128_t t;
secp256k1_uint128 t;
volatile int vflag = flag;
VERIFY_CHECK(bit < 256);
bit += ((uint32_t) vflag - 1) & 0x100; /* forcing (bit >> 6) > 3 makes this a noop */
t = (uint128_t)r->d[0] + (((uint64_t)((bit >> 6) == 0)) << (bit & 0x3F));
r->d[0] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64;
t += (uint128_t)r->d[1] + (((uint64_t)((bit >> 6) == 1)) << (bit & 0x3F));
r->d[1] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64;
t += (uint128_t)r->d[2] + (((uint64_t)((bit >> 6) == 2)) << (bit & 0x3F));
r->d[2] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64;
t += (uint128_t)r->d[3] + (((uint64_t)((bit >> 6) == 3)) << (bit & 0x3F));
r->d[3] = t & 0xFFFFFFFFFFFFFFFFULL;
secp256k1_u128_from_u64(&t, r->d[0]);
secp256k1_u128_accum_u64(&t, ((uint64_t)((bit >> 6) == 0)) << (bit & 0x3F));
r->d[0] = secp256k1_u128_to_u64(&t); secp256k1_u128_rshift(&t, 64);
secp256k1_u128_accum_u64(&t, r->d[1]);
secp256k1_u128_accum_u64(&t, ((uint64_t)((bit >> 6) == 1)) << (bit & 0x3F));
r->d[1] = secp256k1_u128_to_u64(&t); secp256k1_u128_rshift(&t, 64);
secp256k1_u128_accum_u64(&t, r->d[2]);
secp256k1_u128_accum_u64(&t, ((uint64_t)((bit >> 6) == 2)) << (bit & 0x3F));
r->d[2] = secp256k1_u128_to_u64(&t); secp256k1_u128_rshift(&t, 64);
secp256k1_u128_accum_u64(&t, r->d[3]);
secp256k1_u128_accum_u64(&t, ((uint64_t)((bit >> 6) == 3)) << (bit & 0x3F));
r->d[3] = secp256k1_u128_to_u64(&t);
#ifdef VERIFY
VERIFY_CHECK((t >> 64) == 0);
VERIFY_CHECK(secp256k1_scalar_check_overflow(r) == 0);
VERIFY_CHECK(secp256k1_u128_hi_u64(&t) == 0);
#endif
}
@@ -152,14 +162,19 @@ SECP256K1_INLINE static int secp256k1_scalar_is_zero(const secp256k1_scalar *a)
static void secp256k1_scalar_negate(secp256k1_scalar *r, const secp256k1_scalar *a) {
uint64_t nonzero = 0xFFFFFFFFFFFFFFFFULL * (secp256k1_scalar_is_zero(a) == 0);
uint128_t t = (uint128_t)(~a->d[0]) + SECP256K1_N_0 + 1;
r->d[0] = t & nonzero; t >>= 64;
t += (uint128_t)(~a->d[1]) + SECP256K1_N_1;
r->d[1] = t & nonzero; t >>= 64;
t += (uint128_t)(~a->d[2]) + SECP256K1_N_2;
r->d[2] = t & nonzero; t >>= 64;
t += (uint128_t)(~a->d[3]) + SECP256K1_N_3;
r->d[3] = t & nonzero;
secp256k1_uint128 t;
secp256k1_u128_from_u64(&t, ~a->d[0]);
secp256k1_u128_accum_u64(&t, SECP256K1_N_0 + 1);
r->d[0] = secp256k1_u128_to_u64(&t) & nonzero; secp256k1_u128_rshift(&t, 64);
secp256k1_u128_accum_u64(&t, ~a->d[1]);
secp256k1_u128_accum_u64(&t, SECP256K1_N_1);
r->d[1] = secp256k1_u128_to_u64(&t) & nonzero; secp256k1_u128_rshift(&t, 64);
secp256k1_u128_accum_u64(&t, ~a->d[2]);
secp256k1_u128_accum_u64(&t, SECP256K1_N_2);
r->d[2] = secp256k1_u128_to_u64(&t) & nonzero; secp256k1_u128_rshift(&t, 64);
secp256k1_u128_accum_u64(&t, ~a->d[3]);
secp256k1_u128_accum_u64(&t, SECP256K1_N_3);
r->d[3] = secp256k1_u128_to_u64(&t) & nonzero;
}
SECP256K1_INLINE static int secp256k1_scalar_is_one(const secp256k1_scalar *a) {
@@ -184,14 +199,19 @@ static int secp256k1_scalar_cond_negate(secp256k1_scalar *r, int flag) {
volatile int vflag = flag;
uint64_t mask = -vflag;
uint64_t nonzero = (secp256k1_scalar_is_zero(r) != 0) - 1;
uint128_t t = (uint128_t)(r->d[0] ^ mask) + ((SECP256K1_N_0 + 1) & mask);
r->d[0] = t & nonzero; t >>= 64;
t += (uint128_t)(r->d[1] ^ mask) + (SECP256K1_N_1 & mask);
r->d[1] = t & nonzero; t >>= 64;
t += (uint128_t)(r->d[2] ^ mask) + (SECP256K1_N_2 & mask);
r->d[2] = t & nonzero; t >>= 64;
t += (uint128_t)(r->d[3] ^ mask) + (SECP256K1_N_3 & mask);
r->d[3] = t & nonzero;
secp256k1_uint128 t;
secp256k1_u128_from_u64(&t, r->d[0] ^ mask);
secp256k1_u128_accum_u64(&t, (SECP256K1_N_0 + 1) & mask);
r->d[0] = secp256k1_u128_to_u64(&t) & nonzero; secp256k1_u128_rshift(&t, 64);
secp256k1_u128_accum_u64(&t, r->d[1] ^ mask);
secp256k1_u128_accum_u64(&t, SECP256K1_N_1 & mask);
r->d[1] = secp256k1_u128_to_u64(&t) & nonzero; secp256k1_u128_rshift(&t, 64);
secp256k1_u128_accum_u64(&t, r->d[2] ^ mask);
secp256k1_u128_accum_u64(&t, SECP256K1_N_2 & mask);
r->d[2] = secp256k1_u128_to_u64(&t) & nonzero; secp256k1_u128_rshift(&t, 64);
secp256k1_u128_accum_u64(&t, r->d[3] ^ mask);
secp256k1_u128_accum_u64(&t, SECP256K1_N_3 & mask);
r->d[3] = secp256k1_u128_to_u64(&t) & nonzero;
return 2 * (mask == 0) - 1;
}
@ -201,9 +221,10 @@ static int secp256k1_scalar_cond_negate(secp256k1_scalar *r, int flag) {
#define muladd(a,b) { \
uint64_t tl, th; \
{ \
uint128_t t = (uint128_t)a * b; \
th = t >> 64; /* at most 0xFFFFFFFFFFFFFFFE */ \
tl = t; \
secp256k1_uint128 t; \
secp256k1_u128_mul(&t, a, b); \
th = secp256k1_u128_hi_u64(&t); /* at most 0xFFFFFFFFFFFFFFFE */ \
tl = secp256k1_u128_to_u64(&t); \
} \
c0 += tl; /* overflow is handled on the next line */ \
th += (c0 < tl); /* at most 0xFFFFFFFFFFFFFFFF */ \
@ -216,9 +237,10 @@ static int secp256k1_scalar_cond_negate(secp256k1_scalar *r, int flag) {
#define muladd_fast(a,b) { \
uint64_t tl, th; \
{ \
uint128_t t = (uint128_t)a * b; \
th = t >> 64; /* at most 0xFFFFFFFFFFFFFFFE */ \
tl = t; \
secp256k1_uint128 t; \
secp256k1_u128_mul(&t, a, b); \
th = secp256k1_u128_hi_u64(&t); /* at most 0xFFFFFFFFFFFFFFFE */ \
tl = secp256k1_u128_to_u64(&t); \
} \
c0 += tl; /* overflow is handled on the next line */ \
th += (c0 < tl); /* at most 0xFFFFFFFFFFFFFFFF */ \
@ -518,8 +540,8 @@ static void secp256k1_scalar_reduce_512(secp256k1_scalar *r, const uint64_t *l)
: "g"(p0), "g"(p1), "g"(p2), "g"(p3), "g"(p4), "D"(r), "i"(SECP256K1_N_C_0), "i"(SECP256K1_N_C_1)
: "rax", "rdx", "r8", "r9", "r10", "cc", "memory");
#else
uint128_t c;
uint64_t c0, c1, c2;
secp256k1_uint128 c128;
uint64_t c, c0, c1, c2;
uint64_t n0 = l[4], n1 = l[5], n2 = l[6], n3 = l[7];
uint64_t m0, m1, m2, m3, m4, m5;
uint32_t m6;
@ -576,14 +598,18 @@ static void secp256k1_scalar_reduce_512(secp256k1_scalar *r, const uint64_t *l)
/* Reduce 258 bits into 256. */
/* r[0..3] = p[0..3] + p[4] * SECP256K1_N_C. */
c = p0 + (uint128_t)SECP256K1_N_C_0 * p4;
r->d[0] = c & 0xFFFFFFFFFFFFFFFFULL; c >>= 64;
c += p1 + (uint128_t)SECP256K1_N_C_1 * p4;
r->d[1] = c & 0xFFFFFFFFFFFFFFFFULL; c >>= 64;
c += p2 + (uint128_t)p4;
r->d[2] = c & 0xFFFFFFFFFFFFFFFFULL; c >>= 64;
c += p3;
r->d[3] = c & 0xFFFFFFFFFFFFFFFFULL; c >>= 64;
secp256k1_u128_from_u64(&c128, p0);
secp256k1_u128_accum_mul(&c128, SECP256K1_N_C_0, p4);
r->d[0] = secp256k1_u128_to_u64(&c128); secp256k1_u128_rshift(&c128, 64);
secp256k1_u128_accum_u64(&c128, p1);
secp256k1_u128_accum_mul(&c128, SECP256K1_N_C_1, p4);
r->d[1] = secp256k1_u128_to_u64(&c128); secp256k1_u128_rshift(&c128, 64);
secp256k1_u128_accum_u64(&c128, p2);
secp256k1_u128_accum_u64(&c128, p4);
r->d[2] = secp256k1_u128_to_u64(&c128); secp256k1_u128_rshift(&c128, 64);
secp256k1_u128_accum_u64(&c128, p3);
r->d[3] = secp256k1_u128_to_u64(&c128);
c = secp256k1_u128_hi_u64(&c128);
#endif
/* Final reduction of r. */

View File

@ -22,6 +22,7 @@
#include "assumptions.h"
#include "util.h"
#include "field_impl.h"
#include "scalar_impl.h"
#include "group_impl.h"
@ -32,6 +33,7 @@
#include "ecdsa_impl.h"
#include "eckey_impl.h"
#include "hash_impl.h"
#include "int128_impl.h"
#include "scratch_impl.h"
#include "selftest.h"

View File

@ -26,6 +26,7 @@
#include "modinv32_impl.h"
#ifdef SECP256K1_WIDEMUL_INT128
#include "modinv64_impl.h"
#include "int128_impl.h"
#endif
#define CONDITIONAL_TEST(cnt, nam) if (count < (cnt)) { printf("Skipping %s (iteration count too low)\n", nam); } else
@ -476,6 +477,7 @@ void run_scratch_tests(void) {
secp256k1_context_destroy(none);
}
void run_ctz_tests(void) {
static const uint32_t b32[] = {1, 0xffffffff, 0x5e56968f, 0xe0d63129};
static const uint64_t b64[] = {1, 0xffffffffffffffff, 0xbcd02462139b3fc3, 0x98b5f80c769693ef};
@ -860,7 +862,8 @@ uint64_t modinv2p64(uint64_t x) {
return w;
}
/* compute out = (a*b) mod m; if b=NULL, treat b=1.
/* compute out = (a*b) mod m; if b=NULL, treat b=1; if m=NULL, treat m=infinity.
*
* Out is a 512-bit number (represented as 32 uint16_t's in LE order). The other
* arguments are 256-bit numbers (represented as 16 uint16_t's in LE order). */
@ -902,45 +905,47 @@ void mulmod256(uint16_t* out, const uint16_t* a, const uint16_t* b, const uint16
}
}
/* Compute the highest set bit in m. */
for (i = 255; i >= 0; --i) {
if ((m[i >> 4] >> (i & 15)) & 1) {
m_bitlen = i;
break;
}
}
/* Try to do mul -= m<<i, for i going down to 0, whenever the result is not negative */
for (i = mul_bitlen - m_bitlen; i >= 0; --i) {
uint16_t mul2[32];
int64_t cs;
/* Compute mul2 = mul - m<<i. */
cs = 0; /* accumulator */
for (j = 0; j < 32; ++j) { /* j loops over the output limbs in mul2. */
/* Compute sub: the 16 bits in m that will be subtracted from mul2[j]. */
uint16_t sub = 0;
int p;
for (p = 0; p < 16; ++p) { /* p loops over the bit positions in mul2[j]. */
int bitpos = j * 16 - i + p; /* bitpos is the corresponding bit position in m. */
if (bitpos >= 0 && bitpos < 256) {
sub |= ((m[bitpos >> 4] >> (bitpos & 15)) & 1) << p;
}
if (m) {
/* Compute the highest set bit in m. */
for (i = 255; i >= 0; --i) {
if ((m[i >> 4] >> (i & 15)) & 1) {
m_bitlen = i;
break;
}
/* Add mul[j]-sub to accumulator, and shift bottom 16 bits out to mul2[j]. */
cs += mul[j];
cs -= sub;
mul2[j] = (cs & 0xFFFF);
cs >>= 16;
}
/* If remainder of subtraction is 0, set mul = mul2. */
if (cs == 0) {
memcpy(mul, mul2, sizeof(mul));
/* Try to do mul -= m<<i, for i going down to 0, whenever the result is not negative */
for (i = mul_bitlen - m_bitlen; i >= 0; --i) {
uint16_t mul2[32];
int64_t cs;
/* Compute mul2 = mul - m<<i. */
cs = 0; /* accumulator */
for (j = 0; j < 32; ++j) { /* j loops over the output limbs in mul2. */
/* Compute sub: the 16 bits in m that will be subtracted from mul2[j]. */
uint16_t sub = 0;
int p;
for (p = 0; p < 16; ++p) { /* p loops over the bit positions in mul2[j]. */
int bitpos = j * 16 - i + p; /* bitpos is the corresponding bit position in m. */
if (bitpos >= 0 && bitpos < 256) {
sub |= ((m[bitpos >> 4] >> (bitpos & 15)) & 1) << p;
}
}
/* Add mul[j]-sub to accumulator, and shift bottom 16 bits out to mul2[j]. */
cs += mul[j];
cs -= sub;
mul2[j] = (cs & 0xFFFF);
cs >>= 16;
}
/* If remainder of subtraction is 0, set mul = mul2. */
if (cs == 0) {
memcpy(mul, mul2, sizeof(mul));
}
}
/* Sanity check: test that all limbs higher than m's highest are zero */
for (i = (m_bitlen >> 4) + 1; i < 32; ++i) {
CHECK(mul[i] == 0);
}
}
/* Sanity check: test that all limbs higher than m's highest are zero */
for (i = (m_bitlen >> 4) + 1; i < 32; ++i) {
CHECK(mul[i] == 0);
}
memcpy(out, mul, 32);
}
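/* Illustrative usage of the relaxed contract above (mulmod256_demo is a
 * hypothetical helper, not part of the test suite); the operands are tiny
 * hand-picked numbers in the 16x16-bit LE representation: */
void mulmod256_demo(void) {
    uint16_t a[16] = {3}, b[16] = {5}, m[16] = {7};
    uint16_t r[32];
    mulmod256(r, a, b, m);     /* (3 * 5) mod 7 = 1 */
    CHECK(r[0] == 1);
    mulmod256(r, a, NULL, m);  /* b == NULL is treated as b = 1: 3 mod 7 = 3 */
    CHECK(r[0] == 3);
    mulmod256(r, a, b, NULL);  /* m == NULL: no reduction, full product 15 */
    CHECK(r[0] == 15);
}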
@ -1756,8 +1761,305 @@ void run_modinv_tests(void) {
}
}
/***** SCALAR TESTS *****/
/***** INT128 TESTS *****/
#ifdef SECP256K1_WIDEMUL_INT128
/* Add two 256-bit numbers (represented as 16 uint16_t's in LE order) together mod 2^256. */
void add256(uint16_t* out, const uint16_t* a, const uint16_t* b) {
int i;
uint32_t carry = 0;
for (i = 0; i < 16; ++i) {
carry += a[i];
carry += b[i];
out[i] = carry;
carry >>= 16;
}
}
/* Negate a 256-bit number (represented as 16 uint16_t's in LE order) mod 2^256. */
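/* (The carry-in of 1 added to the complemented limbs implements the two's complement identity ~a + 1 == 2^256 - a.) */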
void neg256(uint16_t* out, const uint16_t* a) {
int i;
uint32_t carry = 1;
for (i = 0; i < 16; ++i) {
carry += (uint16_t)~a[i];
out[i] = carry;
carry >>= 16;
}
}
/* Right-shift a 256-bit number (represented as 16 uint16_t's in LE order). */
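/* The shift is arithmetic when sign_extend is nonzero (bit 255 is replicated into vacated positions), logical otherwise. */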
void rshift256(uint16_t* out, const uint16_t* a, int n, int sign_extend) {
uint16_t sign = sign_extend && (a[15] >> 15);
int i, j;
for (i = 15; i >= 0; --i) {
uint16_t v = 0;
for (j = 0; j < 16; ++j) {
int frompos = i*16 + j + n;
if (frompos >= 256) {
v |= sign << j;
} else {
v |= ((uint16_t)((a[frompos >> 4] >> (frompos & 15)) & 1)) << j;
}
}
out[i] = v;
}
}
/* Load a 64-bit integer into an array of 16 uint16_t's in LE order representing a 256-bit value; sign-extend it if is_signed is set. */
void load256u64(uint16_t* out, uint64_t v, int is_signed) {
int i;
uint64_t sign = is_signed && (v >> 63) ? UINT64_MAX : 0;
for (i = 0; i < 4; ++i) {
out[i] = v >> (16 * i);
}
for (i = 4; i < 16; ++i) {
out[i] = sign;
}
}
/* Load a 128-bit integer, given as high and low 64-bit halves, into an array of 16 uint16_t's in LE order representing a 256-bit value; sign-extend it if is_signed is set. */
void load256two64(uint16_t* out, uint64_t hi, uint64_t lo, int is_signed) {
int i;
uint64_t sign = is_signed && (hi >> 63) ? UINT64_MAX : 0;
for (i = 0; i < 4; ++i) {
out[i] = lo >> (16 * i);
}
for (i = 4; i < 8; ++i) {
out[i] = hi >> (16 * (i - 4));
}
for (i = 8; i < 16; ++i) {
out[i] = sign;
}
}
/* Check whether the 256-bit value represented by an array of 16-bit limbs is in range -2^127 <= v < 2^127. */
int int256is127(const uint16_t* v) {
int all_0 = ((v[7] & 0x8000) == 0), all_1 = ((v[7] & 0x8000) == 0x8000);
int i;
for (i = 8; i < 16; ++i) {
if (v[i] != 0) all_0 = 0;
if (v[i] != 0xffff) all_1 = 0;
}
return all_0 || all_1;
}
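/* Load a secp256k1_uint128 into the 16x16-bit LE representation (zero-extended to 256 bits). */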
void load256u128(uint16_t* out, const secp256k1_uint128* v) {
uint64_t lo = secp256k1_u128_to_u64(v), hi = secp256k1_u128_hi_u64(v);
load256two64(out, hi, lo, 0);
}
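/* Load a secp256k1_int128 into the 16x16-bit LE representation (sign-extended to 256 bits). */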
void load256i128(uint16_t* out, const secp256k1_int128* v) {
uint64_t lo;
int64_t hi;
secp256k1_int128 c = *v;
lo = secp256k1_i128_to_i64(&c);
secp256k1_i128_rshift(&c, 64);
hi = secp256k1_i128_to_i64(&c);
load256two64(out, hi, lo, 1);
}
void run_int128_test_case(void) {
unsigned char buf[32];
uint64_t v[4];
secp256k1_int128 swa, swz;
secp256k1_uint128 uwa, uwz;
uint64_t ub, uc;
int64_t sb, sc;
uint16_t rswa[16], rswz[32], rswr[32], ruwa[16], ruwz[32], ruwr[32];
uint16_t rub[16], ruc[16], rsb[16], rsc[16];
int i;
/* Generate 32-byte random value. */
secp256k1_testrand256_test(buf);
/* Convert into 4 64-bit integers. */
for (i = 0; i < 4; ++i) {
uint64_t vi = 0;
int j;
for (j = 0; j < 8; ++j) vi = (vi << 8) + buf[8*i + j];
v[i] = vi;
}
/* Convert those into a 128-bit value and two 64-bit values (signed and unsigned). */
secp256k1_u128_load(&uwa, v[1], v[0]);
secp256k1_i128_load(&swa, v[1], v[0]);
ub = v[2];
sb = v[2];
uc = v[3];
sc = v[3];
/* Load those also into 16-bit array representations. */
load256u128(ruwa, &uwa);
load256i128(rswa, &swa);
load256u64(rub, ub, 0);
load256u64(rsb, sb, 1);
load256u64(ruc, uc, 0);
load256u64(rsc, sc, 1);
/* test secp256k1_u128_mul */
mulmod256(ruwr, rub, ruc, NULL);
secp256k1_u128_mul(&uwz, ub, uc);
load256u128(ruwz, &uwz);
CHECK(secp256k1_memcmp_var(ruwr, ruwz, 16) == 0);
/* test secp256k1_u128_accum_mul */
mulmod256(ruwr, rub, ruc, NULL);
add256(ruwr, ruwr, ruwa);
uwz = uwa;
secp256k1_u128_accum_mul(&uwz, ub, uc);
load256u128(ruwz, &uwz);
CHECK(secp256k1_memcmp_var(ruwr, ruwz, 16) == 0);
/* test secp256k1_u128_accum_u64 */
add256(ruwr, rub, ruwa);
uwz = uwa;
secp256k1_u128_accum_u64(&uwz, ub);
load256u128(ruwz, &uwz);
CHECK(secp256k1_memcmp_var(ruwr, ruwz, 16) == 0);
/* test secp256k1_u128_rshift */
rshift256(ruwr, ruwa, uc % 128, 0);
uwz = uwa;
secp256k1_u128_rshift(&uwz, uc % 128);
load256u128(ruwz, &uwz);
CHECK(secp256k1_memcmp_var(ruwr, ruwz, 16) == 0);
/* test secp256k1_u128_to_u64 */
CHECK(secp256k1_u128_to_u64(&uwa) == v[0]);
/* test secp256k1_u128_hi_u64 */
CHECK(secp256k1_u128_hi_u64(&uwa) == v[1]);
/* test secp256k1_u128_from_u64 */
secp256k1_u128_from_u64(&uwz, ub);
load256u128(ruwz, &uwz);
CHECK(secp256k1_memcmp_var(rub, ruwz, 16) == 0);
/* test secp256k1_u128_check_bits */
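/* (As exercised below, check_bits(v, n) reports whether v < 2^n, i.e. whether v fits in n bits.) */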
{
int uwa_bits = 0;
int j;
for (j = 0; j < 128; ++j) {
if (ruwa[j / 16] >> (j % 16)) uwa_bits = 1 + j;
}
for (j = 0; j < 128; ++j) {
CHECK(secp256k1_u128_check_bits(&uwa, j) == (uwa_bits <= j));
}
}
/* test secp256k1_i128_mul */
mulmod256(rswr, rsb, rsc, NULL);
secp256k1_i128_mul(&swz, sb, sc);
load256i128(rswz, &swz);
CHECK(secp256k1_memcmp_var(rswr, rswz, 16) == 0);
/* test secp256k1_i128_accum_mul */
mulmod256(rswr, rsb, rsc, NULL);
add256(rswr, rswr, rswa);
if (int256is127(rswr)) {
swz = swa;
secp256k1_i128_accum_mul(&swz, sb, sc);
load256i128(rswz, &swz);
CHECK(secp256k1_memcmp_var(rswr, rswz, 16) == 0);
}
/* test secp256k1_i128_det */
{
uint16_t rsd[16], rse[16], rst[32];
int64_t sd = v[0], se = v[1];
load256u64(rsd, sd, 1);
load256u64(rse, se, 1);
mulmod256(rst, rsc, rsd, NULL);
neg256(rst, rst);
mulmod256(rswr, rsb, rse, NULL);
add256(rswr, rswr, rst);
secp256k1_i128_det(&swz, sb, sc, sd, se);
load256i128(rswz, &swz);
CHECK(secp256k1_memcmp_var(rswr, rswz, 16) == 0);
}
/* test secp256k1_i128_rshift */
rshift256(rswr, rswa, uc % 127, 1);
swz = swa;
secp256k1_i128_rshift(&swz, uc % 127);
load256i128(rswz, &swz);
CHECK(secp256k1_memcmp_var(rswr, rswz, 16) == 0);
/* test secp256k1_i128_to_i64 */
CHECK((uint64_t)secp256k1_i128_to_i64(&swa) == v[0]);
/* test secp256k1_i128_from_i64 */
secp256k1_i128_from_i64(&swz, sb);
load256i128(rswz, &swz);
CHECK(secp256k1_memcmp_var(rsb, rswz, 16) == 0);
/* test secp256k1_i128_eq_var */
{
int expect = (uc & 1);
swz = swa;
if (!expect) {
/* Make sure swz != swa */
uint64_t v0c = v[0], v1c = v[1];
if (ub & 64) {
v1c ^= (((uint64_t)1) << (ub & 63));
} else {
v0c ^= (((uint64_t)1) << (ub & 63));
}
secp256k1_i128_load(&swz, v1c, v0c);
}
CHECK(secp256k1_i128_eq_var(&swa, &swz) == expect);
}
/* test secp256k1_i128_check_pow2 */
{
int expect = (uc & 1);
int pos = ub % 127;
if (expect) {
/* If expect==1, set swz to exactly 2^pos. */
uint64_t hi = 0;
uint64_t lo = 0;
if (pos & 64) {
hi = (((uint64_t)1) << (pos & 63));
} else {
lo = (((uint64_t)1) << (pos & 63));
}
secp256k1_i128_load(&swz, hi, lo);
} else {
/* If expect==0, set swz = swa, but update expect=1 if swa happens to equal 2^pos. */
if (pos & 64) {
if ((v[1] == (((uint64_t)1) << (pos & 63))) && v[0] == 0) expect = 1;
} else {
if ((v[0] == (((uint64_t)1) << (pos & 63))) && v[1] == 0) expect = 1;
}
swz = swa;
}
CHECK(secp256k1_i128_check_pow2(&swz, pos) == expect);
}
}
void run_int128_tests(void) {
{ /* secp256k1_u128_accum_mul */
secp256k1_uint128 res;
/* Check secp256k1_u128_accum_mul overflow */
secp256k1_u128_mul(&res, UINT64_MAX, UINT64_MAX);
secp256k1_u128_accum_mul(&res, UINT64_MAX, UINT64_MAX);
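/* 2 * (2^64 - 1)^2 = 2^129 - 2^66 + 2, which mod 2^128 is 2^128 - 2^66 + 2: low half 2, high half 2^64 - 4 = 18446744073709551612. */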
CHECK(secp256k1_u128_to_u64(&res) == 2);
CHECK(secp256k1_u128_hi_u64(&res) == 18446744073709551612U);
}
{ /* secp256k1_i128_accum_mul */
secp256k1_int128 res;
/* Compute INT128_MAX = 2^127 - 1 with secp256k1_i128_accum_mul */
secp256k1_i128_mul(&res, INT64_MAX, INT64_MAX);
secp256k1_i128_accum_mul(&res, INT64_MAX, INT64_MAX);
CHECK(secp256k1_i128_to_i64(&res) == 2);
secp256k1_i128_accum_mul(&res, 4, 9223372036854775807);
secp256k1_i128_accum_mul(&res, 1, 1);
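/* res is now 2*(2^63 - 1)^2 + 4*(2^63 - 1) + 1 = 2^127 - 1, i.e. INT128_MAX. */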
CHECK((uint64_t)secp256k1_i128_to_i64(&res) == UINT64_MAX);
secp256k1_i128_rshift(&res, 64);
CHECK(secp256k1_i128_to_i64(&res) == INT64_MAX);
/* Compute INT128_MIN = - 2^127 with secp256k1_i128_accum_mul */
secp256k1_i128_mul(&res, INT64_MAX, INT64_MIN);
CHECK(secp256k1_i128_to_i64(&res) == INT64_MIN);
secp256k1_i128_accum_mul(&res, INT64_MAX, INT64_MIN);
CHECK(secp256k1_i128_to_i64(&res) == 0);
secp256k1_i128_accum_mul(&res, 2, INT64_MIN);
CHECK(secp256k1_i128_to_i64(&res) == 0);
secp256k1_i128_rshift(&res, 64);
CHECK(secp256k1_i128_to_i64(&res) == INT64_MIN);
}
{
/* Randomized tests. */
int i;
for (i = 0; i < 256 * count; ++i) run_int128_test_case();
}
}
#endif
/***** SCALAR TESTS *****/
void scalar_test(void) {
secp256k1_scalar s;
@ -7409,6 +7711,9 @@ int main(int argc, char **argv) {
run_rand_int();
run_util_tests();
#ifdef SECP256K1_WIDEMUL_INT128
run_int128_tests();
#endif
run_ctz_tests();
run_modinv_tests();
run_inverse_tests();

View File

@ -281,28 +281,36 @@ static SECP256K1_INLINE void secp256k1_int_cmov(int *r, const int *a, int flag)
*r = (int)(r_masked | a_masked);
}
/* If USE_FORCE_WIDEMUL_{INT128,INT64} is set, use that wide multiplication implementation.
* Otherwise use the presence of __SIZEOF_INT128__ to decide.
*/
#if defined(USE_FORCE_WIDEMUL_INT128)
#if defined(USE_FORCE_WIDEMUL_INT128_STRUCT)
/* If USE_FORCE_WIDEMUL_INT128_STRUCT is set, use int128_struct. */
# define SECP256K1_WIDEMUL_INT128 1
# define SECP256K1_INT128_STRUCT 1
#elif defined(USE_FORCE_WIDEMUL_INT128)
/* If USE_FORCE_WIDEMUL_INT128 is set, use int128. */
# define SECP256K1_WIDEMUL_INT128 1
# define SECP256K1_INT128_NATIVE 1
#elif defined(USE_FORCE_WIDEMUL_INT64)
/* If USE_FORCE_WIDEMUL_INT64 is set, use int64. */
# define SECP256K1_WIDEMUL_INT64 1
#elif defined(UINT128_MAX) || defined(__SIZEOF_INT128__)
/* If a native 128-bit integer type exists, use int128. */
# define SECP256K1_WIDEMUL_INT128 1
# define SECP256K1_INT128_NATIVE 1
#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_ARM64))
/* On 64-bit MSVC targets (x86_64 and arm64), use int128_struct
* (which has special logic to implement using intrinsics on those systems). */
# define SECP256K1_WIDEMUL_INT128 1
# define SECP256K1_INT128_STRUCT 1
#elif SIZE_MAX > 0xffffffff
/* Systems with 64-bit pointers (and thus registers) very likely benefit from
* using 64-bit based arithmetic (even if we need to fall back to 32x32->64 based
* multiplication logic). */
# define SECP256K1_WIDEMUL_INT128 1
# define SECP256K1_INT128_STRUCT 1
#else
/* Lastly, fall back to int64 based arithmetic. */
# define SECP256K1_WIDEMUL_INT64 1
#endif
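/* Informative summary of the chain above: exactly one backend ends up selected.
 *   SECP256K1_WIDEMUL_INT128 + SECP256K1_INT128_NATIVE -> native (u)int128_t arithmetic
 *   SECP256K1_WIDEMUL_INT128 + SECP256K1_INT128_STRUCT -> 128-bit arithmetic emulated on 64-bit halves
 *   SECP256K1_WIDEMUL_INT64                            -> pure 64-bit (32x32->64) arithmetic */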
#if defined(SECP256K1_WIDEMUL_INT128)
# if !defined(UINT128_MAX) && defined(__SIZEOF_INT128__)
SECP256K1_GNUC_EXT typedef unsigned __int128 uint128_t;
SECP256K1_GNUC_EXT typedef __int128 int128_t;
#define UINT128_MAX ((uint128_t)(-1))
#define INT128_MAX ((int128_t)(UINT128_MAX >> 1))
#define INT128_MIN (-INT128_MAX - 1)
/* No (U)INT128_C macros because compilers providing __int128 do not support 128-bit literals. */
# endif
#endif
#ifndef __has_builtin
#define __has_builtin(x) 0