Merge bitcoin-core/secp256k1#1184: Signed-digit based ecmult_const algorithm
355bbdf38a2f932daadd02325a0d90d902cb2af4 Add changelog entry for signed-digit ecmult_const algorithm (Pieter Wuille)
21f49d9bec518a769029f809817444a984e735ab Remove unused secp256k1_scalar_shr_int (Pieter Wuille)
115fdc7232a80872c99f88589a5a3608ba757f1d Remove unused secp256k1_wnaf_const (Pieter Wuille)
aa9f3a3c004469033709dc8138892e66adf0b030 ecmult_const: add/improve tests (Jonas Nick)
4d16e90111c050de3b7e25ac451d87cd4e3f874e Signed-digit based ecmult_const algorithm (Pieter Wuille)
ba523be067d6e45957d154838cb9da942704f01a make SECP256K1_SCALAR_CONST reduce modulo exhaustive group order (Pieter Wuille)
2140da9cd5d490d8462d5c7cc909755edc10c1e6 Add secp256k1_scalar_half for halving scalars (+ tests/benchmarks). (Pieter Wuille)

Pull request description:

Using some insights learned from #1058, this replaces the fixed-wnaf ecmult_const algorithm with a signed-digit based one. Conceptually both algorithms are very similar, in that they boil down to summing precomputed odd multiples of the input points. Practically, however, the new algorithm is simpler because it just uses scalar operations, rather than relying on wnaf machinery with skew terms to guarantee odd multipliers.

The idea is that we can compute $q \cdot A$ as follows:

* Let $s = f(q)$, for some function $f()$.
* Compute $(s_1, s_2)$ such that $s = s_1 + \lambda s_2$, using `secp256k1_scalar_lambda_split`.
* Let $v_1 = s_1 + 2^{128}$ and $v_2 = s_2 + 2^{128}$ (such that the $v_i$ are positive and $n$ bits long).
* Compute the result as $$\sum_{i=0}^{n-1} (2v_1[i]-1) 2^i A + \sum_{i=0}^{n-1} (2v_2[i]-1) 2^i \lambda A$$ where $x[i]$ stands for the *i*'th bit of $x$, so the result is a sum of positive and negative powers of two times $A$, based on the bits of $v_1$ and $v_2$.

The comments in `ecmult_const_impl.h` show that if $f(q) = (q + (1+\lambda)(2^n - 2^{129} - 1))/2 \mod n$, the result will equal $q \cdot A$.

This last step can be performed in groups of multiple bits at once, by looking up entries in a precomputed table of odd multiples of $A$ and $\lambda A$, and then multiplying by a power of two before proceeding to the next group.

The result is slightly faster (I measure ~2% speedup), but significantly simpler as it only uses scalar arithmetic to determine the table lookup values. The speedup is due to the fact that no skew corrections at the end are needed, and less overhead to determine table indices. The precomputed table sizes are also made independent from the `ecmult` ones, after observing that the optimal table size is bigger here (which also gives a small speedup).

ACKs for top commit:
  jonasnick:
    ACK 355bbdf38a2f932daadd02325a0d90d902cb2af4
  siv2r:
    ACK 355bbdf
  real-or-random:
    ACK 355bbdf38a2f932daadd02325a0d90d902cb2af4

Tree-SHA512: 13db572cb7f9be00bf0931c65fcd8bc8b5545be86a8c8700bd6a79ad9e4d9e5e79e7f763f92ca6a91d9717a355f8162204b0ea821b6ae99d58cb400497ddc656
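The core identity the description relies on — reading the bits of an $l$-bit value as $+/-$ signs yields $2v + 1 - 2^l$ — is easy to sanity-check on its own. The following standalone C sketch is not part of this PR; the helper name is made up for illustration, and it checks the identity exhaustively for a small $l$:

```c
#include <assert.h>
#include <stdio.h>

/* sum((2*v[i] - 1) * 2^i, i = 0..l-1): the bits of v read as + / - signs. */
static long signed_digit_sum(unsigned v, int l) {
    long acc = 0;
    int i;
    for (i = 0; i < l; i++) {
        acc += ((v >> i) & 1) ? (1L << i) : -(1L << i);
    }
    return acc;
}

int main(void) {
    int l = 6;
    unsigned v;
    for (v = 0; v < (1u << l); v++) {
        /* The identity from the description: the signed-digit sum equals 2*v + 1 - 2^l. */
        assert(signed_digit_sum(v, l) == 2L * v + 1 - (1L << l));
    }
    printf("signed-digit identity verified for all %u values of v\n", 1u << l);
    return 0;
}
```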
@@ -7,6 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

#### Changed
- The point multiplication algorithm used for ECDH operations (module `ecdh`) was replaced with a slightly faster one.

## [0.4.0] - 2023-09-04

#### Added

@ -14,7 +14,6 @@
|
||||
#include "field_impl.h"
|
||||
#include "group_impl.h"
|
||||
#include "scalar_impl.h"
|
||||
#include "ecmult_const_impl.h"
|
||||
#include "ecmult_impl.h"
|
||||
#include "bench.h"
|
||||
|
||||
@ -98,6 +97,18 @@ static void bench_scalar_negate(void* arg, int iters) {
|
||||
}
|
||||
}
|
||||
|
||||
static void bench_scalar_half(void* arg, int iters) {
|
||||
int i;
|
||||
bench_inv *data = (bench_inv*)arg;
|
||||
secp256k1_scalar s = data->scalar[0];
|
||||
|
||||
for (i = 0; i < iters; i++) {
|
||||
secp256k1_scalar_half(&s, &s);
|
||||
}
|
||||
|
||||
data->scalar[0] = s;
|
||||
}
|
||||
|
||||
static void bench_scalar_mul(void* arg, int iters) {
|
||||
int i;
|
||||
bench_inv *data = (bench_inv*)arg;
|
||||
@ -309,18 +320,6 @@ static void bench_ecmult_wnaf(void* arg, int iters) {
|
||||
CHECK(bits <= 256*iters);
|
||||
}
|
||||
|
||||
static void bench_wnaf_const(void* arg, int iters) {
|
||||
int i, bits = 0, overflow = 0;
|
||||
bench_inv *data = (bench_inv*)arg;
|
||||
|
||||
for (i = 0; i < iters; i++) {
|
||||
bits += secp256k1_wnaf_const(data->wnaf, &data->scalar[0], WINDOW_A, 256);
|
||||
overflow += secp256k1_scalar_add(&data->scalar[0], &data->scalar[0], &data->scalar[1]);
|
||||
}
|
||||
CHECK(overflow >= 0);
|
||||
CHECK(bits <= 256*iters);
|
||||
}
|
||||
|
||||
static void bench_sha256(void* arg, int iters) {
|
||||
int i;
|
||||
bench_inv *data = (bench_inv*)arg;
|
||||
@ -370,6 +369,7 @@ int main(int argc, char **argv) {
|
||||
int d = argc == 1; /* default */
|
||||
print_output_table_header_row();
|
||||
|
||||
if (d || have_flag(argc, argv, "scalar") || have_flag(argc, argv, "half")) run_benchmark("scalar_half", bench_scalar_half, bench_setup, NULL, &data, 10, iters*100);
|
||||
if (d || have_flag(argc, argv, "scalar") || have_flag(argc, argv, "add")) run_benchmark("scalar_add", bench_scalar_add, bench_setup, NULL, &data, 10, iters*100);
|
||||
if (d || have_flag(argc, argv, "scalar") || have_flag(argc, argv, "negate")) run_benchmark("scalar_negate", bench_scalar_negate, bench_setup, NULL, &data, 10, iters*100);
|
||||
if (d || have_flag(argc, argv, "scalar") || have_flag(argc, argv, "mul")) run_benchmark("scalar_mul", bench_scalar_mul, bench_setup, NULL, &data, 10, iters*10);
|
||||
@ -394,7 +394,6 @@ int main(int argc, char **argv) {
|
||||
if (d || have_flag(argc, argv, "group") || have_flag(argc, argv, "add")) run_benchmark("group_add_zinv_var", bench_group_add_zinv_var, bench_setup, NULL, &data, 10, iters*10);
|
||||
if (d || have_flag(argc, argv, "group") || have_flag(argc, argv, "to_affine")) run_benchmark("group_to_affine_var", bench_group_to_affine_var, bench_setup, NULL, &data, 10, iters);
|
||||
|
||||
if (d || have_flag(argc, argv, "ecmult") || have_flag(argc, argv, "wnaf")) run_benchmark("wnaf_const", bench_wnaf_const, bench_setup, NULL, &data, 10, iters);
|
||||
if (d || have_flag(argc, argv, "ecmult") || have_flag(argc, argv, "wnaf")) run_benchmark("ecmult_wnaf", bench_ecmult_wnaf, bench_setup, NULL, &data, 10, iters);
|
||||
|
||||
if (d || have_flag(argc, argv, "hash") || have_flag(argc, argv, "sha256")) run_benchmark("hash_sha256", bench_sha256, bench_setup, NULL, &data, 10, iters);
|
||||
|
@ -1,5 +1,5 @@
|
||||
/***********************************************************************
|
||||
* Copyright (c) 2015 Pieter Wuille, Andrew Poelstra *
|
||||
* Copyright (c) 2015, 2022 Pieter Wuille, Andrew Poelstra *
|
||||
* Distributed under the MIT software license, see the accompanying *
|
||||
* file COPYING or https://www.opensource.org/licenses/mit-license.php.*
|
||||
***********************************************************************/
|
||||
@ -12,208 +12,261 @@
|
||||
#include "ecmult_const.h"
|
||||
#include "ecmult_impl.h"
|
||||
|
||||
#if defined(EXHAUSTIVE_TEST_ORDER)
/* We need 2^ECMULT_CONST_GROUP_SIZE - 1 to be less than EXHAUSTIVE_TEST_ORDER, because
 * the tables cannot have infinities in them (this breaks the effective-affine technique's
 * z-ratio tracking) */
# if EXHAUSTIVE_TEST_ORDER == 199
#  define ECMULT_CONST_GROUP_SIZE 4
# elif EXHAUSTIVE_TEST_ORDER == 13
#  define ECMULT_CONST_GROUP_SIZE 3
# elif EXHAUSTIVE_TEST_ORDER == 7
#  define ECMULT_CONST_GROUP_SIZE 2
# else
#  error "Unknown EXHAUSTIVE_TEST_ORDER"
# endif
#else
/* Group size 4 or 5 appears optimal. */
# define ECMULT_CONST_GROUP_SIZE 5
#endif

#define ECMULT_CONST_TABLE_SIZE (1L << (ECMULT_CONST_GROUP_SIZE - 1))
#define ECMULT_CONST_GROUPS ((129 + ECMULT_CONST_GROUP_SIZE - 1) / ECMULT_CONST_GROUP_SIZE)
#define ECMULT_CONST_BITS (ECMULT_CONST_GROUPS * ECMULT_CONST_GROUP_SIZE)

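For a quick feel of what these macros evaluate to, here is a small standalone check (illustrative only, not part of the diff): with the default group size of 5, the table holds 16 odd multiples, the 129-bit scalars are processed in 26 groups, and l = 130, which matches the ECMULT_CONST_BITS == 130 constant K defined further down for group sizes 2 and 5.

```c
#include <assert.h>
#include <stdio.h>

#define GROUP_SIZE 5   /* the default ECMULT_CONST_GROUP_SIZE outside exhaustive tests */
#define TABLE_SIZE (1L << (GROUP_SIZE - 1))
#define GROUPS ((129 + GROUP_SIZE - 1) / GROUP_SIZE)
#define BITS (GROUPS * GROUP_SIZE)

int main(void) {
    /* 16 odd multiples per table, 26 groups of 5 bits, l = 130. */
    assert(TABLE_SIZE == 16 && GROUPS == 26 && BITS == 130);
    printf("table=%ld groups=%d bits=%d\n", (long)TABLE_SIZE, GROUPS, BITS);
    return 0;
}
```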
/** Fill a table 'pre' with precomputed odd multiples of a.
|
||||
*
|
||||
* The resulting point set is brought to a single constant Z denominator, stores the X and Y
|
||||
* coordinates as ge_storage points in pre, and stores the global Z in globalz.
|
||||
* It only operates on tables sized for WINDOW_A wnaf multiples.
|
||||
* coordinates as ge points in pre, and stores the global Z in globalz.
|
||||
*
|
||||
* 'pre' must be an array of size ECMULT_CONST_TABLE_SIZE.
|
||||
*/
|
||||
static void secp256k1_ecmult_odd_multiples_table_globalz_windowa(secp256k1_ge *pre, secp256k1_fe *globalz, const secp256k1_gej *a) {
|
||||
secp256k1_fe zr[ECMULT_TABLE_SIZE(WINDOW_A)];
|
||||
static void secp256k1_ecmult_const_odd_multiples_table_globalz(secp256k1_ge *pre, secp256k1_fe *globalz, const secp256k1_gej *a) {
|
||||
secp256k1_fe zr[ECMULT_CONST_TABLE_SIZE];
|
||||
|
||||
secp256k1_ecmult_odd_multiples_table(ECMULT_TABLE_SIZE(WINDOW_A), pre, zr, globalz, a);
|
||||
secp256k1_ge_table_set_globalz(ECMULT_TABLE_SIZE(WINDOW_A), pre, zr);
|
||||
secp256k1_ecmult_odd_multiples_table(ECMULT_CONST_TABLE_SIZE, pre, zr, globalz, a);
|
||||
secp256k1_ge_table_set_globalz(ECMULT_CONST_TABLE_SIZE, pre, zr);
|
||||
}
|
||||
|
||||
/* This is like `ECMULT_TABLE_GET_GE` but is constant time */
|
||||
#define ECMULT_CONST_TABLE_GET_GE(r,pre,n,w) do { \
|
||||
int m = 0; \
|
||||
/* Extract the sign-bit for a constant time absolute-value. */ \
|
||||
int volatile mask = (n) >> (sizeof(n) * CHAR_BIT - 1); \
|
||||
int abs_n = ((n) + mask) ^ mask; \
|
||||
int idx_n = abs_n >> 1; \
|
||||
/* Given a table 'pre' with odd multiples of a point, put in r the signed-bit multiplication of n with that point.
|
||||
*
|
||||
* For example, if ECMULT_CONST_GROUP_SIZE is 4, then pre is expected to contain 8 entries:
|
||||
* [1*P, 3*P, 5*P, 7*P, 9*P, 11*P, 13*P, 15*P]. n is then expected to be a 4-bit integer (range 0-15), and its
|
||||
* bits are interpreted as signs of powers of two to look up.
|
||||
*
|
||||
* For example, if n=4, which is 0100 in binary, which is interpreted as [- + - -], so the looked up value is
|
||||
* [ -(2^3) + (2^2) - (2^1) - (2^0) ]*P = -7*P. Every valid n translates to an odd number in range [-15,15],
|
||||
* which means we just need to look up one of the precomputed values, and optionally negate it.
|
||||
*/
|
||||
#define ECMULT_CONST_TABLE_GET_GE(r,pre,n) do { \
|
||||
unsigned int m = 0; \
|
||||
/* If the top bit of n is 0, we want the negation. */ \
|
||||
volatile unsigned int negative = ((n) >> (ECMULT_CONST_GROUP_SIZE - 1)) ^ 1; \
|
||||
/* Let n[i] be the i-th bit of n, then the index is
|
||||
* sum(cnot(n[i]) * 2^i, i=0..l-2)
|
||||
* where cnot(b) = b if n[l-1] = 1 and 1 - b otherwise.
|
||||
* For example, if n = 4, in binary 0100, the index is 3, in binary 011.
|
||||
*
|
||||
* Proof:
|
||||
* Let
|
||||
* x = sum((2*n[i] - 1)*2^i, i=0..l-1)
|
||||
* = 2*sum(n[i] * 2^i, i=0..l-1) - 2^l + 1
|
||||
* be the value represented by n.
|
||||
* The index is (x - 1)/2 if x > 0 and -(x + 1)/2 otherwise.
|
||||
* Case x > 0:
|
||||
* n[l-1] = 1
|
||||
* index = sum(n[i] * 2^i, i=0..l-1) - 2^(l-1)
|
||||
* = sum(n[i] * 2^i, i=0..l-2)
|
||||
* Case x <= 0:
|
||||
* n[l-1] = 0
|
||||
* index = -(2*sum(n[i] * 2^i, i=0..l-1) - 2^l + 2)/2
|
||||
* = 2^(l-1) - 1 - sum(n[i] * 2^i, i=0..l-1)
|
||||
* = sum((1 - n[i]) * 2^i, i=0..l-2)
|
||||
*/ \
|
||||
unsigned int index = ((unsigned int)(-negative) ^ n) & ((1U << (ECMULT_CONST_GROUP_SIZE - 1)) - 1U); \
|
||||
secp256k1_fe neg_y; \
|
||||
VERIFY_CHECK(((n) & 1) == 1); \
|
||||
VERIFY_CHECK((n) >= -((1 << ((w)-1)) - 1)); \
|
||||
VERIFY_CHECK((n) <= ((1 << ((w)-1)) - 1)); \
|
||||
VERIFY_CHECK((n) < (1U << ECMULT_CONST_GROUP_SIZE)); \
|
||||
VERIFY_CHECK(index < (1U << (ECMULT_CONST_GROUP_SIZE - 1))); \
|
||||
VERIFY_SETUP(secp256k1_fe_clear(&(r)->x)); \
|
||||
VERIFY_SETUP(secp256k1_fe_clear(&(r)->y)); \
|
||||
/* Unconditionally set r->x = (pre)[m].x. r->y = (pre)[m].y. because it's either the correct one \
|
||||
/* Unconditionally set r->x = (pre)[m].x. r->y = (pre)[m].y. because it's either the correct one
|
||||
* or will get replaced in the later iterations, this is needed to make sure `r` is initialized. */ \
|
||||
(r)->x = (pre)[m].x; \
|
||||
(r)->y = (pre)[m].y; \
|
||||
for (m = 1; m < ECMULT_TABLE_SIZE(w); m++) { \
|
||||
for (m = 1; m < ECMULT_CONST_TABLE_SIZE; m++) { \
|
||||
/* This loop is used to avoid secret data in array indices. See
|
||||
* the comment in ecmult_gen_impl.h for rationale. */ \
|
||||
secp256k1_fe_cmov(&(r)->x, &(pre)[m].x, m == idx_n); \
|
||||
secp256k1_fe_cmov(&(r)->y, &(pre)[m].y, m == idx_n); \
|
||||
secp256k1_fe_cmov(&(r)->x, &(pre)[m].x, m == index); \
|
||||
secp256k1_fe_cmov(&(r)->y, &(pre)[m].y, m == index); \
|
||||
} \
|
||||
(r)->infinity = 0; \
|
||||
secp256k1_fe_negate(&neg_y, &(r)->y, 1); \
|
||||
secp256k1_fe_cmov(&(r)->y, &neg_y, (n) != abs_n); \
|
||||
secp256k1_fe_cmov(&(r)->y, &neg_y, negative); \
|
||||
} while(0)
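The index derivation in the comment above can be checked independently of any group arithmetic. The sketch below is standalone and illustrative (`L` stands in for ECMULT_CONST_GROUP_SIZE); it confirms that the `negative`/`index` computation selects the odd multiple 2*index+1 with the right sign for every possible group value n:

```c
#include <assert.h>
#include <stdio.h>

#define L 5  /* stands in for ECMULT_CONST_GROUP_SIZE */

int main(void) {
    unsigned n;
    for (n = 0; n < (1u << L); n++) {
        /* Value encoded by n when its bits are read as signs of powers of two:
         * sum((2*n[i] - 1) * 2^i, i=0..L-1) = 2*n + 1 - 2^L. */
        int x = 2 * (int)n + 1 - (1 << L);
        /* Same computation as the macro: top bit selects the sign, low bits the entry. */
        unsigned negative = ((n >> (L - 1)) ^ 1u) & 1u;
        unsigned index = ((0u - negative) ^ n) & ((1u << (L - 1)) - 1u);
        int looked_up = (int)(2 * index + 1);  /* the table stores (2*index+1)*P */
        assert((negative ? -looked_up : looked_up) == x);
    }
    printf("index/negation logic verified for all %u group values\n", 1u << L);
    return 0;
}
```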
|
||||
|
||||
/** Convert a number to WNAF notation.
|
||||
* The number becomes represented by sum(2^{wi} * wnaf[i], i=0..WNAF_SIZE(w)+1) - return_val.
|
||||
* It has the following guarantees:
|
||||
* - each wnaf[i] an odd integer between -(1 << w) and (1 << w)
|
||||
* - each wnaf[i] is nonzero
|
||||
* - the number of words set is always WNAF_SIZE(w) + 1
|
||||
/* For K as defined in the comment of secp256k1_ecmult_const, we have several precomputed
|
||||
* formulas/constants.
|
||||
* - in exhaustive test mode, we give an explicit expression to compute it at compile time: */
|
||||
#ifdef EXHAUSTIVE_TEST_ORDER
|
||||
static const secp256k1_scalar secp256k1_ecmult_const_K = ((SECP256K1_SCALAR_CONST(0, 0, 0, (1U << (ECMULT_CONST_BITS - 128)) - 2U, 0, 0, 0, 0) + EXHAUSTIVE_TEST_ORDER - 1U) * (1U + EXHAUSTIVE_TEST_LAMBDA)) % EXHAUSTIVE_TEST_ORDER;
|
||||
/* - for the real secp256k1 group we have constants for various ECMULT_CONST_BITS values. */
|
||||
#elif ECMULT_CONST_BITS == 129
|
||||
/* For GROUP_SIZE = 1,3. */
|
||||
static const secp256k1_scalar secp256k1_ecmult_const_K = SECP256K1_SCALAR_CONST(0xac9c52b3ul, 0x3fa3cf1ful, 0x5ad9e3fdul, 0x77ed9ba4ul, 0xa880b9fcul, 0x8ec739c2ul, 0xe0cfc810ul, 0xb51283ceul);
|
||||
#elif ECMULT_CONST_BITS == 130
|
||||
/* For GROUP_SIZE = 2,5. */
|
||||
static const secp256k1_scalar secp256k1_ecmult_const_K = SECP256K1_SCALAR_CONST(0xa4e88a7dul, 0xcb13034eul, 0xc2bdd6bful, 0x7c118d6bul, 0x589ae848ul, 0x26ba29e4ul, 0xb5c2c1dcul, 0xde9798d9ul);
|
||||
#elif ECMULT_CONST_BITS == 132
|
||||
/* For GROUP_SIZE = 4,6 */
|
||||
static const secp256k1_scalar secp256k1_ecmult_const_K = SECP256K1_SCALAR_CONST(0x76b1d93dul, 0x0fae3c6bul, 0x3215874bul, 0x94e93813ul, 0x7937fe0dul, 0xb66bcaaful, 0xb3749ca5ul, 0xd7b6171bul);
|
||||
#else
|
||||
# error "Unknown ECMULT_CONST_BITS"
|
||||
#endif
|
||||
|
||||
static void secp256k1_ecmult_const(secp256k1_gej *r, const secp256k1_ge *a, const secp256k1_scalar *q) {
|
||||
/* The approach below combines the signed-digit logic from Mike Hamburg's
|
||||
* "Fast and compact elliptic-curve cryptography" (https://eprint.iacr.org/2012/309)
|
||||
* Section 3.3, with the GLV endomorphism.
|
||||
*
|
||||
* Adapted from `The Width-w NAF Method Provides Small Memory and Fast Elliptic Scalar
|
||||
* Multiplications Secure against Side Channel Attacks`, Okeya and Tagaki. M. Joye (Ed.)
|
||||
* CT-RSA 2003, LNCS 2612, pp. 328-443, 2003. Springer-Verlag Berlin Heidelberg 2003
|
||||
* The idea there is to interpret the bits of a scalar as signs (1 = +, 0 = -), and compute a
|
||||
* point multiplication in that fashion. Let v be an n-bit non-negative integer (0 <= v < 2^n),
|
||||
* and v[i] its i'th bit (so v = sum(v[i] * 2^i, i=0..n-1)). Then define:
|
||||
*
|
||||
* Numbers reference steps of `Algorithm SPA-resistant Width-w NAF with Odd Scalar` on pp. 335
|
||||
* C_l(v, A) = sum((2*v[i] - 1) * 2^i*A, i=0..l-1)
|
||||
*
|
||||
* Then it holds that C_l(v, A) = sum((2*v[i] - 1) * 2^i*A, i=0..l-1)
|
||||
* = (2*sum(v[i] * 2^i, i=0..l-1) + 1 - 2^l) * A
|
||||
* = (2*v + 1 - 2^l) * A
|
||||
*
|
||||
* Thus, one can compute q*A as C_256((q + 2^256 - 1) / 2, A). This is the basis for the
|
||||
* paper's signed-digit multi-comb algorithm for multiplication using a precomputed table.
|
||||
*
|
||||
* It is appealing to try to combine this with the GLV optimization: the idea that a scalar
|
||||
* s can be written as s1 + lambda*s2, where lambda is a curve-specific constant such that
|
||||
* lambda*A is easy to compute, and where s1 and s2 are small. In particular we have the
|
||||
* secp256k1_scalar_split_lambda function which performs such a split with the resulting s1
|
||||
* and s2 in range (-2^128, 2^128) mod n. This does work, but is uninteresting:
|
||||
*
|
||||
* To compute q*A:
|
||||
* - Let s1, s2 = split_lambda(q)
|
||||
* - Let R1 = C_256((s1 + 2^256 - 1) / 2, A)
|
||||
* - Let R2 = C_256((s2 + 2^256 - 1) / 2, lambda*A)
|
||||
* - Return R1 + R2
|
||||
*
|
||||
* The issue is that while s1 and s2 are small-range numbers, (s1 + 2^256 - 1) / 2 (mod n)
|
||||
* and (s2 + 2^256 - 1) / 2 (mod n) are not, undoing the benefit of the splitting.
|
||||
*
|
||||
* To make it work, we want to modify the input scalar q first, before splitting, and then only
|
||||
* add a 2^128 offset of the split results (so that they end up in the single 129-bit range
|
||||
* [0,2^129]). A slightly smaller offset would work due to the bounds on the split, but we pick
|
||||
* 2^128 for simplicity. Let s be the scalar fed to split_lambda, and f(q) the function to
|
||||
* compute it from q:
|
||||
*
|
||||
* To compute q*A:
|
||||
* - Compute s = f(q)
|
||||
* - Let s1, s2 = split_lambda(s)
|
||||
* - Let v1 = s1 + 2^128 (mod n)
|
||||
* - Let v2 = s2 + 2^128 (mod n)
|
||||
* - Let R1 = C_l(v1, A)
|
||||
* - Let R2 = C_l(v2, lambda*A)
|
||||
* - Return R1 + R2
|
||||
*
|
||||
* l will thus need to be at least 129, but we may overshoot by a few bits (see
|
||||
* further), so keep it as a variable.
|
||||
*
|
||||
* To solve for s, we reason:
|
||||
* q*A = R1 + R2
|
||||
* <=> q*A = C_l(s1 + 2^128, A) + C_l(s2 + 2^128, lambda*A)
|
||||
* <=> q*A = (2*(s1 + 2^128) + 1 - 2^l) * A + (2*(s2 + 2^128) + 1 - 2^l) * lambda*A
|
||||
* <=> q*A = (2*(s1 + s2*lambda) + (2^129 + 1 - 2^l) * (1 + lambda)) * A
|
||||
* <=> q = 2*(s1 + s2*lambda) + (2^129 + 1 - 2^l) * (1 + lambda) (mod n)
|
||||
* <=> q = 2*s + (2^129 + 1 - 2^l) * (1 + lambda) (mod n)
|
||||
* <=> s = (q + (2^l - 2^129 - 1) * (1 + lambda)) / 2 (mod n)
|
||||
* <=> f(q) = (q + K) / 2 (mod n)
|
||||
* where K = (2^l - 2^129 - 1)*(1 + lambda) (mod n)
|
||||
*
|
||||
* We will process the computation of C_l(v1, A) and C_l(v2, lambda*A) in groups of
|
||||
* ECMULT_CONST_GROUP_SIZE, so we set l to the smallest multiple of ECMULT_CONST_GROUP_SIZE
|
||||
* that is not less than 129; this equals ECMULT_CONST_BITS.
|
||||
*/
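The solve-for-s step above is pure modular arithmetic, so it can be exercised with toy numbers. The following standalone sketch is illustrative only: the modulus, lambda and q are arbitrary stand-ins, not secp256k1 values. It checks that s = (q + K)/2 with K = (2^l - 2^129 - 1)(1 + lambda) indeed satisfies q = 2s + (2^129 + 1 - 2^l)(1 + lambda) modulo the toy order.

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Toy parameters: any odd "order" and any "lambda" work for the algebra above. */
#define N 1000003u      /* toy odd modulus standing in for the group order */
#define LAMBDA 123457u  /* toy stand-in for the endomorphism scalar */
#define L 130u          /* stand-in for ECMULT_CONST_BITS */

static uint32_t addm(uint32_t a, uint32_t b) { return (uint32_t)(((uint64_t)a + b) % N); }
static uint32_t mulm(uint32_t a, uint32_t b) { return (uint32_t)(((uint64_t)a * b) % N); }
static uint32_t pow2m(unsigned e) {           /* 2^e mod N by repeated doubling */
    uint32_t r = 1; while (e--) r = addm(r, r); return r;
}
static uint32_t halfm(uint32_t a) {           /* a/2 mod N, valid because N is odd */
    return (a & 1) ? (uint32_t)(((uint64_t)a + N) >> 1) : a >> 1;
}

int main(void) {
    uint32_t q = 918273u;  /* arbitrary scalar below N */
    /* K = (2^l - 2^129 - 1) * (1 + lambda) mod N, as in the comment above. */
    uint32_t k = mulm(addm(addm(pow2m(L), N - pow2m(129)), N - 1), addm(1, LAMBDA));
    uint32_t s = halfm(addm(q, k));            /* s = f(q) = (q + K)/2 mod N */
    /* Check the defining relation: q == 2*s + (2^129 + 1 - 2^l)*(1 + lambda) mod N. */
    uint32_t offset = mulm(addm(addm(pow2m(129), 1), N - pow2m(L)), addm(1, LAMBDA));
    assert(addm(addm(s, s), offset) == q);
    printf("f(q) = (q + K)/2 satisfies the relation for q=%u\n", q);
    return 0;
}
```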
|
||||
static int secp256k1_wnaf_const(int *wnaf, const secp256k1_scalar *scalar, int w, int size) {
|
||||
int global_sign;
|
||||
int skew;
|
||||
int word = 0;
|
||||
|
||||
/* 1 2 3 */
|
||||
int u_last;
|
||||
int u;
|
||||
|
||||
int flip;
|
||||
secp256k1_scalar s = *scalar;
|
||||
|
||||
VERIFY_CHECK(w > 0);
|
||||
VERIFY_CHECK(size > 0);
|
||||
|
||||
/* Note that we cannot handle even numbers by negating them to be odd, as is
|
||||
* done in other implementations, since if our scalars were specified to have
|
||||
* width < 256 for performance reasons, their negations would have width 256
|
||||
* and we'd lose any performance benefit. Instead, we use a variation of a
|
||||
* technique from Section 4.2 of the Okeya/Tagaki paper, which is to add 1 to the
|
||||
* number we are encoding when it is even, returning a skew value indicating
|
||||
* this, and having the caller compensate after doing the multiplication.
|
||||
*
|
||||
* In fact, we _do_ want to negate numbers to minimize their bit-lengths (and in
|
||||
* particular, to ensure that the outputs from the endomorphism-split fit into
|
||||
* 128 bits). If we negate, the parity of our number flips, affecting whether
|
||||
* we want to add to the scalar to ensure that it's odd. */
|
||||
flip = secp256k1_scalar_is_high(&s);
|
||||
skew = flip ^ secp256k1_scalar_is_even(&s);
|
||||
secp256k1_scalar_cadd_bit(&s, 0, skew);
|
||||
global_sign = secp256k1_scalar_cond_negate(&s, flip);
|
||||
|
||||
/* 4 */
|
||||
u_last = secp256k1_scalar_shr_int(&s, w);
|
||||
do {
|
||||
int even;
|
||||
|
||||
/* 4.1 4.4 */
|
||||
u = secp256k1_scalar_shr_int(&s, w);
|
||||
/* 4.2 */
|
||||
even = ((u & 1) == 0);
|
||||
/* In contrast to the original algorithm, u_last is always > 0 and
|
||||
* therefore we do not need to check its sign. In particular, it's easy
|
||||
* to see that u_last is never < 0 because u is never < 0. Moreover,
|
||||
* u_last is never = 0 because u is never even after a loop
|
||||
* iteration. The same holds analogously for the initial value of
|
||||
* u_last (in the first loop iteration). */
|
||||
VERIFY_CHECK(u_last > 0);
|
||||
VERIFY_CHECK((u_last & 1) == 1);
|
||||
u += even;
|
||||
u_last -= even * (1 << w);
|
||||
|
||||
/* 4.3, adapted for global sign change */
|
||||
wnaf[word++] = u_last * global_sign;
|
||||
|
||||
u_last = u;
|
||||
} while (word * w < size);
|
||||
wnaf[word] = u * global_sign;
|
||||
|
||||
VERIFY_CHECK(secp256k1_scalar_is_zero(&s));
|
||||
VERIFY_CHECK(word == WNAF_SIZE_BITS(size, w));
|
||||
return skew;
|
||||
}
|
||||
|
||||
static void secp256k1_ecmult_const(secp256k1_gej *r, const secp256k1_ge *a, const secp256k1_scalar *scalar) {
|
||||
secp256k1_ge pre_a[ECMULT_TABLE_SIZE(WINDOW_A)];
|
||||
secp256k1_ge tmpa;
|
||||
secp256k1_fe Z;
|
||||
|
||||
int skew_1;
|
||||
secp256k1_ge pre_a_lam[ECMULT_TABLE_SIZE(WINDOW_A)];
|
||||
int wnaf_lam[1 + WNAF_SIZE(WINDOW_A - 1)];
|
||||
int skew_lam;
|
||||
secp256k1_scalar q_1, q_lam;
|
||||
int wnaf_1[1 + WNAF_SIZE(WINDOW_A - 1)];
|
||||
|
||||
int i;
|
||||
/* The offset to add to s1 and s2 to make them non-negative. Equal to 2^128. */
|
||||
static const secp256k1_scalar S_OFFSET = SECP256K1_SCALAR_CONST(0, 0, 0, 1, 0, 0, 0, 0);
|
||||
secp256k1_scalar s, v1, v2;
|
||||
secp256k1_ge pre_a[ECMULT_CONST_TABLE_SIZE];
|
||||
secp256k1_ge pre_a_lam[ECMULT_CONST_TABLE_SIZE];
|
||||
secp256k1_fe global_z;
|
||||
int group, i;
|
||||
|
||||
/* We're allowed to be non-constant time in the point, and the code below (in particular,
|
||||
* secp256k1_ecmult_const_odd_multiples_table_globalz) cannot deal with infinity in a
|
||||
* constant-time manner anyway. */
|
||||
if (secp256k1_ge_is_infinity(a)) {
|
||||
secp256k1_gej_set_infinity(r);
|
||||
return;
|
||||
}
|
||||
|
||||
/* build wnaf representation for q. */
|
||||
/* split q into q_1 and q_lam (where q = q_1 + q_lam*lambda, and q_1 and q_lam are ~128 bit) */
|
||||
secp256k1_scalar_split_lambda(&q_1, &q_lam, scalar);
|
||||
skew_1 = secp256k1_wnaf_const(wnaf_1, &q_1, WINDOW_A - 1, 128);
|
||||
skew_lam = secp256k1_wnaf_const(wnaf_lam, &q_lam, WINDOW_A - 1, 128);
|
||||
/* Compute v1 and v2. */
|
||||
secp256k1_scalar_add(&s, q, &secp256k1_ecmult_const_K);
|
||||
secp256k1_scalar_half(&s, &s);
|
||||
secp256k1_scalar_split_lambda(&v1, &v2, &s);
|
||||
secp256k1_scalar_add(&v1, &v1, &S_OFFSET);
|
||||
secp256k1_scalar_add(&v2, &v2, &S_OFFSET);
|
||||
|
||||
/* Calculate odd multiples of a.
|
||||
#ifdef VERIFY
|
||||
/* Verify that v1 and v2 are in range [0, 2^129-1]. */
|
||||
for (i = 129; i < 256; ++i) {
|
||||
VERIFY_CHECK(secp256k1_scalar_get_bits(&v1, i, 1) == 0);
|
||||
VERIFY_CHECK(secp256k1_scalar_get_bits(&v2, i, 1) == 0);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Calculate odd multiples of A and A*lambda.
|
||||
* All multiples are brought to the same Z 'denominator', which is stored
|
||||
* in Z. Due to secp256k1' isomorphism we can do all operations pretending
|
||||
* in global_z. Due to secp256k1' isomorphism we can do all operations pretending
|
||||
* that the Z coordinate was 1, use affine addition formulae, and correct
|
||||
* the Z coordinate of the result once at the end.
|
||||
*/
|
||||
VERIFY_CHECK(!a->infinity);
|
||||
secp256k1_gej_set_ge(r, a);
|
||||
secp256k1_ecmult_odd_multiples_table_globalz_windowa(pre_a, &Z, r);
|
||||
for (i = 0; i < ECMULT_TABLE_SIZE(WINDOW_A); i++) {
|
||||
secp256k1_fe_normalize_weak(&pre_a[i].y);
|
||||
}
|
||||
for (i = 0; i < ECMULT_TABLE_SIZE(WINDOW_A); i++) {
|
||||
secp256k1_ecmult_const_odd_multiples_table_globalz(pre_a, &global_z, r);
|
||||
for (i = 0; i < ECMULT_CONST_TABLE_SIZE; i++) {
|
||||
secp256k1_ge_mul_lambda(&pre_a_lam[i], &pre_a[i]);
|
||||
}
|
||||
|
||||
/* first loop iteration (separated out so we can directly set r, rather
|
||||
* than having it start at infinity, get doubled several times, then have
|
||||
* its new value added to it) */
|
||||
i = wnaf_1[WNAF_SIZE_BITS(128, WINDOW_A - 1)];
|
||||
VERIFY_CHECK(i != 0);
|
||||
ECMULT_CONST_TABLE_GET_GE(&tmpa, pre_a, i, WINDOW_A);
|
||||
secp256k1_gej_set_ge(r, &tmpa);
|
||||
i = wnaf_lam[WNAF_SIZE_BITS(128, WINDOW_A - 1)];
|
||||
VERIFY_CHECK(i != 0);
|
||||
ECMULT_CONST_TABLE_GET_GE(&tmpa, pre_a_lam, i, WINDOW_A);
|
||||
secp256k1_gej_add_ge(r, r, &tmpa);
|
||||
/* remaining loop iterations */
|
||||
for (i = WNAF_SIZE_BITS(128, WINDOW_A - 1) - 1; i >= 0; i--) {
|
||||
int n;
|
||||
/* Next, we compute r = C_l(v1, A) + C_l(v2, lambda*A).
|
||||
*
|
||||
* We proceed in groups of ECMULT_CONST_GROUP_SIZE bits, operating on that many bits
|
||||
* at a time, from high in v1, v2 to low. Call these bits1 (from v1) and bits2 (from v2).
|
||||
*
|
||||
* Now note that ECMULT_CONST_TABLE_GET_GE(&t, pre_a, bits1) loads into t a point equal
|
||||
* to C_{ECMULT_CONST_GROUP_SIZE}(bits1, A), and analogously for pre_lam_a / bits2.
|
||||
* This means that all we need to do is add these looked up values together, multiplied
|
||||
* by 2^(ECMULT_GROUP_SIZE * group).
|
||||
*/
|
||||
for (group = ECMULT_CONST_GROUPS - 1; group >= 0; --group) {
|
||||
/* Using the _var get_bits function is ok here, since it's only variable in offset and count, not in the scalar. */
|
||||
unsigned int bits1 = secp256k1_scalar_get_bits_var(&v1, group * ECMULT_CONST_GROUP_SIZE, ECMULT_CONST_GROUP_SIZE);
|
||||
unsigned int bits2 = secp256k1_scalar_get_bits_var(&v2, group * ECMULT_CONST_GROUP_SIZE, ECMULT_CONST_GROUP_SIZE);
|
||||
secp256k1_ge t;
|
||||
int j;
|
||||
for (j = 0; j < WINDOW_A - 1; ++j) {
|
||||
|
||||
ECMULT_CONST_TABLE_GET_GE(&t, pre_a, bits1);
|
||||
if (group == ECMULT_CONST_GROUPS - 1) {
|
||||
/* Directly set r in the first iteration. */
|
||||
secp256k1_gej_set_ge(r, &t);
|
||||
} else {
|
||||
/* Shift the result so far up. */
|
||||
for (j = 0; j < ECMULT_CONST_GROUP_SIZE; ++j) {
|
||||
secp256k1_gej_double(r, r);
|
||||
}
|
||||
|
||||
n = wnaf_1[i];
|
||||
ECMULT_CONST_TABLE_GET_GE(&tmpa, pre_a, n, WINDOW_A);
|
||||
VERIFY_CHECK(n != 0);
|
||||
secp256k1_gej_add_ge(r, r, &tmpa);
|
||||
n = wnaf_lam[i];
|
||||
ECMULT_CONST_TABLE_GET_GE(&tmpa, pre_a_lam, n, WINDOW_A);
|
||||
VERIFY_CHECK(n != 0);
|
||||
secp256k1_gej_add_ge(r, r, &tmpa);
|
||||
secp256k1_gej_add_ge(r, r, &t);
|
||||
}
|
||||
ECMULT_CONST_TABLE_GET_GE(&t, pre_a_lam, bits2);
|
||||
secp256k1_gej_add_ge(r, r, &t);
|
||||
}
|
||||
|
||||
{
|
||||
/* Correct for wNAF skew */
|
||||
secp256k1_gej tmpj;
|
||||
|
||||
secp256k1_ge_neg(&tmpa, &pre_a[0]);
|
||||
secp256k1_gej_add_ge(&tmpj, r, &tmpa);
|
||||
secp256k1_gej_cmov(r, &tmpj, skew_1);
|
||||
|
||||
secp256k1_ge_neg(&tmpa, &pre_a_lam[0]);
|
||||
secp256k1_gej_add_ge(&tmpj, r, &tmpa);
|
||||
secp256k1_gej_cmov(r, &tmpj, skew_lam);
|
||||
}
|
||||
|
||||
secp256k1_fe_mul(&r->z, &r->z, &Z);
|
||||
/* Map the result back to the secp256k1 curve from the isomorphic curve. */
|
||||
secp256k1_fe_mul(&r->z, &r->z, &global_z);
|
||||
}
|
||||
|
||||
static int secp256k1_ecmult_const_xonly(secp256k1_fe* r, const secp256k1_fe *n, const secp256k1_fe *d, const secp256k1_scalar *q, int known_on_curve) {
|
||||
|
@ -25,7 +25,7 @@ static void secp256k1_scalar_clear(secp256k1_scalar *r);
|
||||
/** Access bits from a scalar. All requested bits must belong to the same 32-bit limb. */
|
||||
static unsigned int secp256k1_scalar_get_bits(const secp256k1_scalar *a, unsigned int offset, unsigned int count);
|
||||
|
||||
/** Access bits from a scalar. Not constant time. */
|
||||
/** Access bits from a scalar. Not constant time in offset and count. */
|
||||
static unsigned int secp256k1_scalar_get_bits_var(const secp256k1_scalar *a, unsigned int offset, unsigned int count);
|
||||
|
||||
/** Set a scalar from a big endian byte array. The scalar will be reduced modulo group order `n`.
|
||||
@ -54,10 +54,6 @@ static void secp256k1_scalar_cadd_bit(secp256k1_scalar *r, unsigned int bit, int
|
||||
/** Multiply two scalars (modulo the group order). */
|
||||
static void secp256k1_scalar_mul(secp256k1_scalar *r, const secp256k1_scalar *a, const secp256k1_scalar *b);
|
||||
|
||||
/** Shift a scalar right by some amount strictly between 0 and 16, returning
|
||||
* the low bits that were shifted off */
|
||||
static int secp256k1_scalar_shr_int(secp256k1_scalar *r, int n);
|
||||
|
||||
/** Compute the inverse of a scalar (modulo the group order). */
|
||||
static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar *a);
|
||||
|
||||
@ -67,6 +63,9 @@ static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_sc
|
||||
/** Compute the complement of a scalar (modulo the group order). */
|
||||
static void secp256k1_scalar_negate(secp256k1_scalar *r, const secp256k1_scalar *a);
|
||||
|
||||
/** Multiply a scalar with the multiplicative inverse of 2. */
|
||||
static void secp256k1_scalar_half(secp256k1_scalar *r, const secp256k1_scalar *a);
|
||||
|
||||
/** Check whether a scalar equals zero. */
|
||||
static int secp256k1_scalar_is_zero(const secp256k1_scalar *a);
|
||||
|
||||
|
@ -199,6 +199,47 @@ static void secp256k1_scalar_negate(secp256k1_scalar *r, const secp256k1_scalar
|
||||
secp256k1_scalar_verify(r);
|
||||
}
|
||||
|
||||
static void secp256k1_scalar_half(secp256k1_scalar *r, const secp256k1_scalar *a) {
|
||||
/* Writing `/` for field division and `//` for integer division, we compute
|
||||
*
|
||||
* a/2 = (a - (a&1))/2 + (a&1)/2
|
||||
* = (a >> 1) + (a&1 ? 1/2 : 0)
|
||||
* = (a >> 1) + (a&1 ? n//2+1 : 0),
|
||||
*
|
||||
* where n is the group order and in the last equality we have used 1/2 = n//2+1 (mod n).
|
||||
* For n//2, we have the constants SECP256K1_N_H_0, ...
|
||||
*
|
||||
* This sum does not overflow. The most extreme case is a = -2, the largest odd scalar. Here:
|
||||
* - the left summand is: a >> 1 = (a - a&1)/2 = (n-2-1)//2 = (n-3)//2
|
||||
* - the right summand is: a&1 ? n//2+1 : 0 = n//2+1 = (n-1)//2 + 2//2 = (n+1)//2
|
||||
* Together they sum to (n-3)//2 + (n+1)//2 = (2n-2)//2 = n - 1, which is less than n.
|
||||
*/
|
||||
uint64_t mask = -(uint64_t)(a->d[0] & 1U);
|
||||
secp256k1_uint128 t;
|
||||
secp256k1_scalar_verify(a);
|
||||
|
||||
secp256k1_u128_from_u64(&t, (a->d[0] >> 1) | (a->d[1] << 63));
|
||||
secp256k1_u128_accum_u64(&t, (SECP256K1_N_H_0 + 1U) & mask);
|
||||
r->d[0] = secp256k1_u128_to_u64(&t); secp256k1_u128_rshift(&t, 64);
|
||||
secp256k1_u128_accum_u64(&t, (a->d[1] >> 1) | (a->d[2] << 63));
|
||||
secp256k1_u128_accum_u64(&t, SECP256K1_N_H_1 & mask);
|
||||
r->d[1] = secp256k1_u128_to_u64(&t); secp256k1_u128_rshift(&t, 64);
|
||||
secp256k1_u128_accum_u64(&t, (a->d[2] >> 1) | (a->d[3] << 63));
|
||||
secp256k1_u128_accum_u64(&t, SECP256K1_N_H_2 & mask);
|
||||
r->d[2] = secp256k1_u128_to_u64(&t); secp256k1_u128_rshift(&t, 64);
|
||||
r->d[3] = secp256k1_u128_to_u64(&t) + (a->d[3] >> 1) + (SECP256K1_N_H_3 & mask);
|
||||
#ifdef VERIFY
|
||||
/* The line above only computed the bottom 64 bits of r->d[3]; redo the computation
|
||||
* in full 128 bits to make sure the top 64 bits are indeed zero. */
|
||||
secp256k1_u128_accum_u64(&t, a->d[3] >> 1);
|
||||
secp256k1_u128_accum_u64(&t, SECP256K1_N_H_3 & mask);
|
||||
secp256k1_u128_rshift(&t, 64);
|
||||
VERIFY_CHECK(secp256k1_u128_to_u64(&t) == 0);
|
||||
|
||||
secp256k1_scalar_verify(r);
|
||||
#endif
|
||||
}
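The identity in the comment — a/2 = (a >> 1) + (a odd ? n//2 + 1 : 0) modulo an odd n — can be checked with plain integers. A minimal standalone sketch, using a toy odd modulus rather than the real group order:

```c
#include <assert.h>
#include <stdio.h>

int main(void) {
    const unsigned n = 101;              /* toy odd modulus standing in for the group order */
    const unsigned half_inv = n / 2 + 1; /* 1/2 mod n, i.e. (n+1)/2 */
    unsigned a;
    for (a = 0; a < n; a++) {
        /* Same shape as the code above: shift right, then add n//2 + 1 when a is odd.
         * As argued in the comment, the sum stays below n, so no reduction is needed. */
        unsigned half = (a >> 1) + ((a & 1) ? half_inv : 0);
        assert(half < n);
        assert((2 * half) % n == a);     /* doubling the half gives back a */
    }
    printf("halving identity verified mod %u\n", n);
    return 0;
}
```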
|
||||
|
||||
SECP256K1_INLINE static int secp256k1_scalar_is_one(const secp256k1_scalar *a) {
|
||||
secp256k1_scalar_verify(a);
|
||||
|
||||
@ -809,22 +850,6 @@ static void secp256k1_scalar_mul(secp256k1_scalar *r, const secp256k1_scalar *a,
|
||||
secp256k1_scalar_verify(r);
|
||||
}
|
||||
|
||||
static int secp256k1_scalar_shr_int(secp256k1_scalar *r, int n) {
|
||||
int ret;
|
||||
secp256k1_scalar_verify(r);
|
||||
VERIFY_CHECK(n > 0);
|
||||
VERIFY_CHECK(n < 16);
|
||||
|
||||
ret = r->d[0] & ((1 << n) - 1);
|
||||
r->d[0] = (r->d[0] >> n) + (r->d[1] << (64 - n));
|
||||
r->d[1] = (r->d[1] >> n) + (r->d[2] << (64 - n));
|
||||
r->d[2] = (r->d[2] >> n) + (r->d[3] << (64 - n));
|
||||
r->d[3] = (r->d[3] >> n);
|
||||
|
||||
secp256k1_scalar_verify(r);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void secp256k1_scalar_split_128(secp256k1_scalar *r1, secp256k1_scalar *r2, const secp256k1_scalar *k) {
|
||||
secp256k1_scalar_verify(k);
|
||||
|
||||
|
@ -245,6 +245,55 @@ static void secp256k1_scalar_negate(secp256k1_scalar *r, const secp256k1_scalar
|
||||
secp256k1_scalar_verify(r);
|
||||
}
|
||||
|
||||
static void secp256k1_scalar_half(secp256k1_scalar *r, const secp256k1_scalar *a) {
|
||||
/* Writing `/` for field division and `//` for integer division, we compute
|
||||
*
|
||||
* a/2 = (a - (a&1))/2 + (a&1)/2
|
||||
* = (a >> 1) + (a&1 ? 1/2 : 0)
|
||||
* = (a >> 1) + (a&1 ? n//2+1 : 0),
|
||||
*
|
||||
* where n is the group order and in the last equality we have used 1/2 = n//2+1 (mod n).
|
||||
* For n//2, we have the constants SECP256K1_N_H_0, ...
|
||||
*
|
||||
* This sum does not overflow. The most extreme case is a = -2, the largest odd scalar. Here:
|
||||
* - the left summand is: a >> 1 = (a - a&1)/2 = (n-2-1)//2 = (n-3)//2
|
||||
* - the right summand is: a&1 ? n//2+1 : 0 = n//2+1 = (n-1)//2 + 2//2 = (n+1)//2
|
||||
* Together they sum to (n-3)//2 + (n+1)//2 = (2n-2)//2 = n - 1, which is less than n.
|
||||
*/
|
||||
uint32_t mask = -(uint32_t)(a->d[0] & 1U);
|
||||
uint64_t t = (uint32_t)((a->d[0] >> 1) | (a->d[1] << 31));
|
||||
secp256k1_scalar_verify(a);
|
||||
|
||||
t += (SECP256K1_N_H_0 + 1U) & mask;
|
||||
r->d[0] = t; t >>= 32;
|
||||
t += (uint32_t)((a->d[1] >> 1) | (a->d[2] << 31));
|
||||
t += SECP256K1_N_H_1 & mask;
|
||||
r->d[1] = t; t >>= 32;
|
||||
t += (uint32_t)((a->d[2] >> 1) | (a->d[3] << 31));
|
||||
t += SECP256K1_N_H_2 & mask;
|
||||
r->d[2] = t; t >>= 32;
|
||||
t += (uint32_t)((a->d[3] >> 1) | (a->d[4] << 31));
|
||||
t += SECP256K1_N_H_3 & mask;
|
||||
r->d[3] = t; t >>= 32;
|
||||
t += (uint32_t)((a->d[4] >> 1) | (a->d[5] << 31));
|
||||
t += SECP256K1_N_H_4 & mask;
|
||||
r->d[4] = t; t >>= 32;
|
||||
t += (uint32_t)((a->d[5] >> 1) | (a->d[6] << 31));
|
||||
t += SECP256K1_N_H_5 & mask;
|
||||
r->d[5] = t; t >>= 32;
|
||||
t += (uint32_t)((a->d[6] >> 1) | (a->d[7] << 31));
|
||||
t += SECP256K1_N_H_6 & mask;
|
||||
r->d[6] = t; t >>= 32;
|
||||
r->d[7] = (uint32_t)t + (uint32_t)(a->d[7] >> 1) + (SECP256K1_N_H_7 & mask);
|
||||
#ifdef VERIFY
|
||||
/* The line above only computed the bottom 32 bits of r->d[7]. Redo the computation
|
||||
* in full 64 bits to make sure the top 32 bits are indeed zero. */
|
||||
VERIFY_CHECK((t + (a->d[7] >> 1) + (SECP256K1_N_H_7 & mask)) >> 32 == 0);
|
||||
|
||||
secp256k1_scalar_verify(r);
|
||||
#endif
|
||||
}
|
||||
|
||||
SECP256K1_INLINE static int secp256k1_scalar_is_one(const secp256k1_scalar *a) {
|
||||
secp256k1_scalar_verify(a);
|
||||
|
||||
@ -613,26 +662,6 @@ static void secp256k1_scalar_mul(secp256k1_scalar *r, const secp256k1_scalar *a,
|
||||
secp256k1_scalar_verify(r);
|
||||
}
|
||||
|
||||
static int secp256k1_scalar_shr_int(secp256k1_scalar *r, int n) {
|
||||
int ret;
|
||||
secp256k1_scalar_verify(r);
|
||||
VERIFY_CHECK(n > 0);
|
||||
VERIFY_CHECK(n < 16);
|
||||
|
||||
ret = r->d[0] & ((1 << n) - 1);
|
||||
r->d[0] = (r->d[0] >> n) + (r->d[1] << (32 - n));
|
||||
r->d[1] = (r->d[1] >> n) + (r->d[2] << (32 - n));
|
||||
r->d[2] = (r->d[2] >> n) + (r->d[3] << (32 - n));
|
||||
r->d[3] = (r->d[3] >> n) + (r->d[4] << (32 - n));
|
||||
r->d[4] = (r->d[4] >> n) + (r->d[5] << (32 - n));
|
||||
r->d[5] = (r->d[5] >> n) + (r->d[6] << (32 - n));
|
||||
r->d[6] = (r->d[6] >> n) + (r->d[7] << (32 - n));
|
||||
r->d[7] = (r->d[7] >> n);
|
||||
|
||||
secp256k1_scalar_verify(r);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void secp256k1_scalar_split_128(secp256k1_scalar *r1, secp256k1_scalar *r2, const secp256k1_scalar *k) {
|
||||
secp256k1_scalar_verify(k);
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
/***********************************************************************
|
||||
* Copyright (c) 2015 Andrew Poelstra *
|
||||
* Copyright (c) 2015, 2022 Andrew Poelstra, Pieter Wuille *
|
||||
* Distributed under the MIT software license, see the accompanying *
|
||||
* file COPYING or https://www.opensource.org/licenses/mit-license.php.*
|
||||
***********************************************************************/
|
||||
@@ -12,6 +12,13 @@
/** A scalar modulo the group order of the secp256k1 curve. */
typedef uint32_t secp256k1_scalar;

#define SECP256K1_SCALAR_CONST(d7, d6, d5, d4, d3, d2, d1, d0) (d0)
/* A compile-time constant equal to 2^32 (modulo order). */
#define SCALAR_2P32 ((0xffffffffUL % EXHAUSTIVE_TEST_ORDER) + 1U)

/* Compute a*2^32 + b (modulo order). */
#define SCALAR_HORNER(a, b) (((uint64_t)(a) * SCALAR_2P32 + (b)) % EXHAUSTIVE_TEST_ORDER)

/* Evaluates to the provided 256-bit constant reduced modulo order. */
#define SECP256K1_SCALAR_CONST(d7, d6, d5, d4, d3, d2, d1, d0) SCALAR_HORNER(SCALAR_HORNER(SCALAR_HORNER(SCALAR_HORNER(SCALAR_HORNER(SCALAR_HORNER(SCALAR_HORNER((d7), (d6)), (d5)), (d4)), (d3)), (d2)), (d1)), (d0))

#endif /* SECP256K1_SCALAR_REPR_H */
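The Horner-style folding above reduces one 32-bit limb at a time using 2^32 mod order. A standalone sketch of the same idea with two limbs and a small order (the values are arbitrary; this is not library code):

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define ORDER 13u                               /* toy stand-in for EXHAUSTIVE_TEST_ORDER */
#define TWO_P32 ((0xffffffffUL % ORDER) + 1U)   /* 2^32 mod ORDER */
#define HORNER(a, b) (((uint64_t)(a) * TWO_P32 + (b)) % ORDER)

int main(void) {
    /* Fold a 64-bit constant limb by limb and compare with direct reduction. */
    uint32_t d1 = 0x9abcdef0u, d0 = 0x12345678u;
    uint64_t full = ((uint64_t)d1 << 32) | d0;
    assert(HORNER(d1, d0) == full % ORDER);
    printf("Horner folding matches direct reduction: %u\n", (unsigned)(full % ORDER));
    return 0;
}
```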
|
||||
|
@ -139,19 +139,6 @@ static void secp256k1_scalar_mul(secp256k1_scalar *r, const secp256k1_scalar *a,
|
||||
secp256k1_scalar_verify(r);
|
||||
}
|
||||
|
||||
static int secp256k1_scalar_shr_int(secp256k1_scalar *r, int n) {
|
||||
int ret;
|
||||
secp256k1_scalar_verify(r);
|
||||
VERIFY_CHECK(n > 0);
|
||||
VERIFY_CHECK(n < 16);
|
||||
|
||||
ret = *r & ((1 << n) - 1);
|
||||
*r >>= n;
|
||||
|
||||
secp256k1_scalar_verify(r);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void secp256k1_scalar_split_128(secp256k1_scalar *r1, secp256k1_scalar *r2, const secp256k1_scalar *a) {
|
||||
secp256k1_scalar_verify(a);
|
||||
|
||||
@ -205,4 +192,12 @@ static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_sc
|
||||
secp256k1_scalar_verify(r);
|
||||
}
|
||||
|
||||
static void secp256k1_scalar_half(secp256k1_scalar *r, const secp256k1_scalar *a) {
|
||||
secp256k1_scalar_verify(a);
|
||||
|
||||
*r = (*a + ((-(uint32_t)(*a & 1)) & EXHAUSTIVE_TEST_ORDER)) >> 1;
|
||||
|
||||
secp256k1_scalar_verify(r);
|
||||
}
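This low-order variant uses the equivalent formulation (a + (a odd ? order : 0)) >> 1. A tiny standalone check with a toy odd order (illustrative only):

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    const uint32_t order = 199;  /* toy odd order, like one of the exhaustive-test orders */
    uint32_t a;
    for (a = 0; a < order; a++) {
        uint32_t half = (a + ((-(uint32_t)(a & 1)) & order)) >> 1;
        assert((2 * half) % order == a);  /* doubling the half recovers a */
    }
    printf("low-order halving formula verified mod %u\n", order);
    return 0;
}
```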
|
||||
|
||||
#endif /* SECP256K1_SCALAR_REPR_IMPL_H */
|
||||
|
src/tests.c
@ -2180,20 +2180,6 @@ static void scalar_test(void) {
|
||||
CHECK(secp256k1_scalar_eq(&n, &s));
|
||||
}
|
||||
|
||||
{
|
||||
/* test secp256k1_scalar_shr_int */
|
||||
secp256k1_scalar r;
|
||||
int i;
|
||||
random_scalar_order_test(&r);
|
||||
for (i = 0; i < 100; ++i) {
|
||||
int low;
|
||||
int shift = 1 + secp256k1_testrand_int(15);
|
||||
int expected = r.d[0] % (1ULL << shift);
|
||||
low = secp256k1_scalar_shr_int(&r, shift);
|
||||
CHECK(expected == low);
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
/* Test commutativity of add. */
|
||||
secp256k1_scalar r1, r2;
|
||||
@ -2285,6 +2271,13 @@ static void scalar_test(void) {
|
||||
CHECK(secp256k1_scalar_eq(&r1, &secp256k1_scalar_zero));
|
||||
}
|
||||
|
||||
{
|
||||
/* Test halving. */
|
||||
secp256k1_scalar r;
|
||||
secp256k1_scalar_add(&r, &s, &s);
|
||||
secp256k1_scalar_half(&r, &r);
|
||||
CHECK(secp256k1_scalar_eq(&r, &s));
|
||||
}
|
||||
}
|
||||
|
||||
static void run_scalar_set_b32_seckey_tests(void) {
|
||||
@ -2337,6 +2330,38 @@ static void run_scalar_tests(void) {
|
||||
CHECK(secp256k1_scalar_is_zero(&o));
|
||||
}
|
||||
|
||||
{
|
||||
/* Test that halving and doubling roundtrips on some fixed values. */
|
||||
static const secp256k1_scalar HALF_TESTS[] = {
|
||||
/* 0 */
|
||||
SECP256K1_SCALAR_CONST(0, 0, 0, 0, 0, 0, 0, 0),
|
||||
/* 1 */
|
||||
SECP256K1_SCALAR_CONST(0, 0, 0, 0, 0, 0, 0, 1),
|
||||
/* -1 */
|
||||
SECP256K1_SCALAR_CONST(0xfffffffful, 0xfffffffful, 0xfffffffful, 0xfffffffeul, 0xbaaedce6ul, 0xaf48a03bul, 0xbfd25e8cul, 0xd0364140ul),
|
||||
/* -2 (largest odd value) */
|
||||
SECP256K1_SCALAR_CONST(0xfffffffful, 0xfffffffful, 0xfffffffful, 0xfffffffeul, 0xbaaedce6ul, 0xaf48a03bul, 0xbfd25e8cul, 0xd036413Ful),
|
||||
/* Half the secp256k1 order */
|
||||
SECP256K1_SCALAR_CONST(0x7ffffffful, 0xfffffffful, 0xfffffffful, 0xfffffffful, 0x5d576e73ul, 0x57a4501dul, 0xdfe92f46ul, 0x681b20a0ul),
|
||||
/* Half the secp256k1 order + 1 */
|
||||
SECP256K1_SCALAR_CONST(0x7ffffffful, 0xfffffffful, 0xfffffffful, 0xfffffffful, 0x5d576e73ul, 0x57a4501dul, 0xdfe92f46ul, 0x681b20a1ul),
|
||||
/* 2^255 */
|
||||
SECP256K1_SCALAR_CONST(0x80000000ul, 0, 0, 0, 0, 0, 0, 0),
|
||||
/* 2^255 - 1 */
|
||||
SECP256K1_SCALAR_CONST(0x7ffffffful, 0xfffffffful, 0xfffffffful, 0xfffffffful, 0xfffffffful, 0xfffffffful, 0xfffffffful, 0xfffffffful),
|
||||
};
|
||||
unsigned n;
|
||||
for (n = 0; n < sizeof(HALF_TESTS) / sizeof(HALF_TESTS[0]); ++n) {
|
||||
secp256k1_scalar s;
|
||||
secp256k1_scalar_half(&s, &HALF_TESTS[n]);
|
||||
secp256k1_scalar_add(&s, &s, &s);
|
||||
CHECK(secp256k1_scalar_eq(&s, &HALF_TESTS[n]));
|
||||
secp256k1_scalar_add(&s, &s, &s);
|
||||
secp256k1_scalar_half(&s, &s);
|
||||
CHECK(secp256k1_scalar_eq(&s, &HALF_TESTS[n]));
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
/* Does check_overflow check catch all ones? */
|
||||
static const secp256k1_scalar overflowed = SECP256K1_SCALAR_CONST(
|
||||
@ -4424,25 +4449,74 @@ static void ecmult_const_commutativity(void) {
|
||||
}
|
||||
|
||||
static void ecmult_const_mult_zero_one(void) {
|
||||
secp256k1_scalar s;
|
||||
secp256k1_scalar negone;
|
||||
secp256k1_gej res1;
|
||||
secp256k1_ge res2;
|
||||
secp256k1_ge point;
|
||||
secp256k1_scalar_negate(&negone, &secp256k1_scalar_one);
|
||||
secp256k1_ge inf;
|
||||
|
||||
random_scalar_order_test(&s);
|
||||
secp256k1_scalar_negate(&negone, &secp256k1_scalar_one);
|
||||
random_group_element_test(&point);
|
||||
secp256k1_ge_set_infinity(&inf);
|
||||
|
||||
/* 0*point */
|
||||
secp256k1_ecmult_const(&res1, &point, &secp256k1_scalar_zero);
|
||||
secp256k1_ge_set_gej(&res2, &res1);
|
||||
CHECK(secp256k1_ge_is_infinity(&res2));
|
||||
CHECK(secp256k1_gej_is_infinity(&res1));
|
||||
|
||||
/* s*inf */
|
||||
secp256k1_ecmult_const(&res1, &inf, &s);
|
||||
CHECK(secp256k1_gej_is_infinity(&res1));
|
||||
|
||||
/* 1*point */
|
||||
secp256k1_ecmult_const(&res1, &point, &secp256k1_scalar_one);
|
||||
secp256k1_ge_set_gej(&res2, &res1);
|
||||
ge_equals_ge(&res2, &point);
|
||||
|
||||
/* -1*point */
|
||||
secp256k1_ecmult_const(&res1, &point, &negone);
|
||||
secp256k1_gej_neg(&res1, &res1);
|
||||
secp256k1_ge_set_gej(&res2, &res1);
|
||||
ge_equals_ge(&res2, &point);
|
||||
}
|
||||
|
||||
static void ecmult_const_check_result(const secp256k1_ge *A, const secp256k1_scalar* q, const secp256k1_gej *res) {
|
||||
secp256k1_gej pointj, res2j;
|
||||
secp256k1_ge res2;
|
||||
secp256k1_gej_set_ge(&pointj, A);
|
||||
secp256k1_ecmult(&res2j, &pointj, q, &secp256k1_scalar_zero);
|
||||
secp256k1_ge_set_gej(&res2, &res2j);
|
||||
ge_equals_gej(&res2, res);
|
||||
}
|
||||
|
||||
static void ecmult_const_edges(void) {
|
||||
secp256k1_scalar q;
|
||||
secp256k1_ge point;
|
||||
secp256k1_gej res;
|
||||
size_t i;
|
||||
size_t cases = 1 + sizeof(scalars_near_split_bounds) / sizeof(scalars_near_split_bounds[0]);
|
||||
|
||||
/* We are trying to reach the following edge cases (variables are defined as
|
||||
* in ecmult_const_impl.h):
|
||||
* 1. i = 0: s = 0 <=> q = -K
|
||||
* 2. i > 0: v1, v2 large values
|
||||
* <=> s1, s2 large values
|
||||
* <=> s = scalars_near_split_bounds[i]
|
||||
* <=> q = 2*scalars_near_split_bounds[i] - K
|
||||
*/
|
||||
for (i = 0; i < cases; ++i) {
|
||||
secp256k1_scalar_negate(&q, &secp256k1_ecmult_const_K);
|
||||
if (i > 0) {
|
||||
secp256k1_scalar_add(&q, &q, &scalars_near_split_bounds[i - 1]);
|
||||
secp256k1_scalar_add(&q, &q, &scalars_near_split_bounds[i - 1]);
|
||||
}
|
||||
random_group_element_test(&point);
|
||||
secp256k1_ecmult_const(&res, &point, &q);
|
||||
ecmult_const_check_result(&point, &q, &res);
|
||||
}
|
||||
}
|
||||
|
||||
static void ecmult_const_mult_xonly(void) {
|
||||
int i;
|
||||
|
||||
@ -4526,6 +4600,7 @@ static void ecmult_const_chain_multiply(void) {
|
||||
|
||||
static void run_ecmult_const_tests(void) {
|
||||
ecmult_const_mult_zero_one();
|
||||
ecmult_const_edges();
|
||||
ecmult_const_random_mult();
|
||||
ecmult_const_commutativity();
|
||||
ecmult_const_chain_multiply();
|
||||
@ -5186,73 +5261,17 @@ static void test_wnaf(const secp256k1_scalar *number, int w) {
|
||||
CHECK(secp256k1_scalar_eq(&x, number)); /* check that wnaf represents number */
|
||||
}
|
||||
|
||||
static void test_constant_wnaf_negate(const secp256k1_scalar *number) {
|
||||
secp256k1_scalar neg1 = *number;
|
||||
secp256k1_scalar neg2 = *number;
|
||||
int sign1 = 1;
|
||||
int sign2 = 1;
|
||||
|
||||
if (!secp256k1_scalar_get_bits(&neg1, 0, 1)) {
|
||||
secp256k1_scalar_negate(&neg1, &neg1);
|
||||
sign1 = -1;
|
||||
}
|
||||
sign2 = secp256k1_scalar_cond_negate(&neg2, secp256k1_scalar_is_even(&neg2));
|
||||
CHECK(sign1 == sign2);
|
||||
CHECK(secp256k1_scalar_eq(&neg1, &neg2));
|
||||
}
|
||||
|
||||
static void test_constant_wnaf(const secp256k1_scalar *number, int w) {
|
||||
secp256k1_scalar x, shift;
|
||||
int wnaf[256] = {0};
|
||||
int i;
|
||||
int skew;
|
||||
int bits = 256;
|
||||
secp256k1_scalar num = *number;
|
||||
secp256k1_scalar scalar_skew;
|
||||
|
||||
secp256k1_scalar_set_int(&x, 0);
|
||||
secp256k1_scalar_set_int(&shift, 1 << w);
|
||||
for (i = 0; i < 16; ++i) {
|
||||
secp256k1_scalar_shr_int(&num, 8);
|
||||
}
|
||||
bits = 128;
|
||||
skew = secp256k1_wnaf_const(wnaf, &num, w, bits);
|
||||
|
||||
for (i = WNAF_SIZE_BITS(bits, w); i >= 0; --i) {
|
||||
secp256k1_scalar t;
|
||||
int v = wnaf[i];
|
||||
CHECK(v != 0); /* check nonzero */
|
||||
CHECK(v & 1); /* check parity */
|
||||
CHECK(v > -(1 << w)); /* check range above */
|
||||
CHECK(v < (1 << w)); /* check range below */
|
||||
|
||||
secp256k1_scalar_mul(&x, &x, &shift);
|
||||
if (v >= 0) {
|
||||
secp256k1_scalar_set_int(&t, v);
|
||||
} else {
|
||||
secp256k1_scalar_set_int(&t, -v);
|
||||
secp256k1_scalar_negate(&t, &t);
|
||||
}
|
||||
secp256k1_scalar_add(&x, &x, &t);
|
||||
}
|
||||
/* Skew num because when encoding numbers as odd we use an offset */
|
||||
secp256k1_scalar_set_int(&scalar_skew, skew);
|
||||
secp256k1_scalar_add(&num, &num, &scalar_skew);
|
||||
CHECK(secp256k1_scalar_eq(&x, &num));
|
||||
}
|
||||
|
||||
static void test_fixed_wnaf(const secp256k1_scalar *number, int w) {
|
||||
secp256k1_scalar x, shift;
|
||||
int wnaf[256] = {0};
|
||||
int i;
|
||||
int skew;
|
||||
secp256k1_scalar num = *number;
|
||||
secp256k1_scalar num, unused;
|
||||
|
||||
secp256k1_scalar_set_int(&x, 0);
|
||||
secp256k1_scalar_set_int(&shift, 1 << w);
|
||||
for (i = 0; i < 16; ++i) {
|
||||
secp256k1_scalar_shr_int(&num, 8);
|
||||
}
|
||||
/* Make num a 128-bit scalar. */
|
||||
secp256k1_scalar_split_128(&num, &unused, number);
|
||||
skew = secp256k1_wnaf_fixed(wnaf, &num, w);
|
||||
|
||||
for (i = WNAF_SIZE(w)-1; i >= 0; --i) {
|
||||
@ -5344,32 +5363,7 @@ static void test_fixed_wnaf_small(void) {
|
||||
|
||||
static void run_wnaf(void) {
|
||||
int i;
|
||||
secp256k1_scalar n = {{0}};
|
||||
|
||||
test_constant_wnaf(&n, 4);
|
||||
/* Sanity check: 1 and 2 are the smallest odd and even numbers and should
|
||||
* have easier-to-diagnose failure modes */
|
||||
n.d[0] = 1;
|
||||
test_constant_wnaf(&n, 4);
|
||||
n.d[0] = 2;
|
||||
test_constant_wnaf(&n, 4);
|
||||
/* Test -1, because it's a special case in wnaf_const */
|
||||
n = secp256k1_scalar_one;
|
||||
secp256k1_scalar_negate(&n, &n);
|
||||
test_constant_wnaf(&n, 4);
|
||||
|
||||
/* Test -2, which may not lead to overflows in wnaf_const */
|
||||
secp256k1_scalar_add(&n, &secp256k1_scalar_one, &secp256k1_scalar_one);
|
||||
secp256k1_scalar_negate(&n, &n);
|
||||
test_constant_wnaf(&n, 4);
|
||||
|
||||
/* Test (1/2) - 1 = 1/-2 and 1/2 = (1/-2) + 1
|
||||
as corner cases of negation handling in wnaf_const */
|
||||
secp256k1_scalar_inverse(&n, &n);
|
||||
test_constant_wnaf(&n, 4);
|
||||
|
||||
secp256k1_scalar_add(&n, &n, &secp256k1_scalar_one);
|
||||
test_constant_wnaf(&n, 4);
|
||||
secp256k1_scalar n;
|
||||
|
||||
/* Test 0 for fixed wnaf */
|
||||
test_fixed_wnaf_small();
|
||||
@ -5377,8 +5371,6 @@ static void run_wnaf(void) {
|
||||
for (i = 0; i < COUNT; i++) {
|
||||
random_scalar_order(&n);
|
||||
test_wnaf(&n, 4+(i%10));
|
||||
test_constant_wnaf_negate(&n);
|
||||
test_constant_wnaf(&n, 4 + (i % 10));
|
||||
test_fixed_wnaf(&n, 4 + (i % 10));
|
||||
}
|
||||
secp256k1_scalar_set_int(&n, 0);
|
||||
|