Merge bitcoin-core/secp256k1#810: Avoid overly-wide multiplications in 5x52 field mul/sqr

b53e0cd61fce0bcef178f317537c91efc9afd04d Avoid overly-wide multiplications (Peter Dettman)

Pull request description:

  Speeds up bench_ecdh, bench_sign, bench_verify relative to master by 5+% at -O3, haswell.

ACKs for top commit:
  sipa:
    ACK b53e0cd61fce0bcef178f317537c91efc9afd04d
  real-or-random:
    ACK b53e0cd61fce0bcef178f317537c91efc9afd04d I've inspected the diff and run the tests without asm for a CPU day

Tree-SHA512: 4f79c98371a3dc9da013632210c8db979f910b222291999dfaa0c31849a77eb427361e4ab9206cbfee73c30a8933178784d6cb8e747e8dca6b227eb77fbea2a2
This commit is contained in:
Tim Ruffing 2021-10-17 17:06:37 +02:00
commit 9526874d14
No known key found for this signature in database
GPG Key ID: 8C461CCD293F6011

View File

@ -49,14 +49,14 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t
c = (uint128_t)a4 * b[4];
VERIFY_BITS(c, 112);
/* [c 0 0 0 0 d 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */
d += (c & M) * R; c >>= 52;
d += (uint128_t)R * (uint64_t)c; c >>= 64;
VERIFY_BITS(d, 115);
VERIFY_BITS(c, 60);
/* [c 0 0 0 0 0 d 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */
VERIFY_BITS(c, 48);
/* [(c<<12) 0 0 0 0 0 d 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */
t3 = d & M; d >>= 52;
VERIFY_BITS(t3, 52);
VERIFY_BITS(d, 63);
/* [c 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */
/* [(c<<12) 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */
d += (uint128_t)a0 * b[4]
+ (uint128_t)a1 * b[3]
@ -64,8 +64,8 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t
+ (uint128_t)a3 * b[1]
+ (uint128_t)a4 * b[0];
VERIFY_BITS(d, 115);
/* [c 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */
d += c * R;
/* [(c<<12) 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */
d += (uint128_t)(R << 12) * (uint64_t)c;
VERIFY_BITS(d, 116);
/* [d t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */
t4 = d & M; d >>= 52;
@ -129,17 +129,16 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t
+ (uint128_t)a4 * b[3];
VERIFY_BITS(d, 114);
/* [d 0 0 t4 t3 c t1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
c += (d & M) * R; d >>= 52;
c += (uint128_t)R * (uint64_t)d; d >>= 64;
VERIFY_BITS(c, 115);
VERIFY_BITS(d, 62);
/* [d 0 0 0 t4 t3 c r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
VERIFY_BITS(d, 50);
/* [(d<<12) 0 0 0 t4 t3 c r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
/* [d 0 0 0 t4 t3 c r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
r[2] = c & M; c >>= 52;
VERIFY_BITS(r[2], 52);
VERIFY_BITS(c, 63);
/* [d 0 0 0 t4 t3+c r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
c += d * R + t3;
/* [(d<<12) 0 0 0 t4 t3+c r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
c += (uint128_t)(R << 12) * (uint64_t)d + t3;
VERIFY_BITS(c, 100);
/* [t4 c r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
r[3] = c & M; c >>= 52;
@ -178,22 +177,22 @@ SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t
c = (uint128_t)a4 * a4;
VERIFY_BITS(c, 112);
/* [c 0 0 0 0 d 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */
d += (c & M) * R; c >>= 52;
d += (uint128_t)R * (uint64_t)c; c >>= 64;
VERIFY_BITS(d, 115);
VERIFY_BITS(c, 60);
/* [c 0 0 0 0 0 d 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */
VERIFY_BITS(c, 48);
/* [(c<<12) 0 0 0 0 0 d 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */
t3 = d & M; d >>= 52;
VERIFY_BITS(t3, 52);
VERIFY_BITS(d, 63);
/* [c 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */
/* [(c<<12) 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */
a4 *= 2;
d += (uint128_t)a0 * a4
+ (uint128_t)(a1*2) * a3
+ (uint128_t)a2 * a2;
VERIFY_BITS(d, 115);
/* [c 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */
d += c * R;
/* [(c<<12) 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */
d += (uint128_t)(R << 12) * (uint64_t)c;
VERIFY_BITS(d, 116);
/* [d t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */
t4 = d & M; d >>= 52;
@ -252,16 +251,16 @@ SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t
d += (uint128_t)a3 * a4;
VERIFY_BITS(d, 114);
/* [d 0 0 t4 t3 c r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
c += (d & M) * R; d >>= 52;
c += (uint128_t)R * (uint64_t)d; d >>= 64;
VERIFY_BITS(c, 115);
VERIFY_BITS(d, 62);
/* [d 0 0 0 t4 t3 c r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
VERIFY_BITS(d, 50);
/* [(d<<12) 0 0 0 t4 t3 c r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
r[2] = c & M; c >>= 52;
VERIFY_BITS(r[2], 52);
VERIFY_BITS(c, 63);
/* [d 0 0 0 t4 t3+c r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
/* [(d<<12) 0 0 0 t4 t3+c r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
c += d * R + t3;
c += (uint128_t)(R << 12) * (uint64_t)d + t3;
VERIFY_BITS(c, 100);
/* [t4 c r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
r[3] = c & M; c >>= 52;