Improve normalization performance for the 32-bit field implementation
- Use a similar approach to the latest 64-bit _normalize. - Add one useful optimization back into the 64-bit _normalize as well. Performance of 'bench' improved by around 0.5% for the 32-bit field (though this was measured on a 64-bit machine).
This commit is contained in:
		
							parent
							
								
									f33793fb99
								
							
						
					
					
						commit
						42822baaa8
					
				| @ -15,77 +15,52 @@ void static secp256k1_fe_inner_start(void) {} | ||||
| void static secp256k1_fe_inner_stop(void) {} | ||||
| 
 | ||||
| void static secp256k1_fe_normalize(secp256k1_fe_t *r) { | ||||
| //    fog("normalize in: ", r);
 | ||||
|     uint32_t c; | ||||
|     c = r->n[0]; | ||||
|     uint32_t t0 = c & 0x3FFFFFFUL; | ||||
|     c = (c >> 26) + r->n[1]; | ||||
|     uint32_t t1 = c & 0x3FFFFFFUL; | ||||
|     c = (c >> 26) + r->n[2]; | ||||
|     uint32_t t2 = c & 0x3FFFFFFUL; | ||||
|     c = (c >> 26) + r->n[3]; | ||||
|     uint32_t t3 = c & 0x3FFFFFFUL; | ||||
|     c = (c >> 26) + r->n[4]; | ||||
|     uint32_t t4 = c & 0x3FFFFFFUL; | ||||
|     c = (c >> 26) + r->n[5]; | ||||
|     uint32_t t5 = c & 0x3FFFFFFUL; | ||||
|     c = (c >> 26) + r->n[6]; | ||||
|     uint32_t t6 = c & 0x3FFFFFFUL; | ||||
|     c = (c >> 26) + r->n[7]; | ||||
|     uint32_t t7 = c & 0x3FFFFFFUL; | ||||
|     c = (c >> 26) + r->n[8]; | ||||
|     uint32_t t8 = c & 0x3FFFFFFUL; | ||||
|     c = (c >> 26) + r->n[9]; | ||||
|     uint32_t t9 = c & 0x03FFFFFUL; | ||||
|     c >>= 22; | ||||
| /*    r->n[0] = t0; r->n[1] = t1; r->n[2] = t2; r->n[3] = t3; r->n[4] = t4;
 | ||||
|     r->n[5] = t5; r->n[6] = t6; r->n[7] = t7; r->n[8] = t8; r->n[9] = t9; | ||||
|     fog("         tm1: ", r); | ||||
|     fprintf(stderr, "out c= %08lx\n", (unsigned long)c);*/ | ||||
|     uint32_t t0 = r->n[0], t1 = r->n[1], t2 = r->n[2], t3 = r->n[3], t4 = r->n[4], | ||||
|              t5 = r->n[5], t6 = r->n[6], t7 = r->n[7], t8 = r->n[8], t9 = r->n[9]; | ||||
| 
 | ||||
|     // The following code will not modify the t's if c is initially 0.
 | ||||
|     uint32_t d = c * 0x3D1UL + t0; | ||||
|     t0 = d & 0x3FFFFFFULL; | ||||
|     d = (d >> 26) + t1 + c*0x40; | ||||
|     t1 = d & 0x3FFFFFFULL; | ||||
|     d = (d >> 26) + t2; | ||||
|     t2 = d & 0x3FFFFFFULL; | ||||
|     d = (d >> 26) + t3; | ||||
|     t3 = d & 0x3FFFFFFULL; | ||||
|     d = (d >> 26) + t4; | ||||
|     t4 = d & 0x3FFFFFFULL; | ||||
|     d = (d >> 26) + t5; | ||||
|     t5 = d & 0x3FFFFFFULL; | ||||
|     d = (d >> 26) + t6; | ||||
|     t6 = d & 0x3FFFFFFULL; | ||||
|     d = (d >> 26) + t7; | ||||
|     t7 = d & 0x3FFFFFFULL; | ||||
|     d = (d >> 26) + t8; | ||||
|     t8 = d & 0x3FFFFFFULL; | ||||
|     d = (d >> 26) + t9; | ||||
|     t9 = d & 0x03FFFFFULL; | ||||
|     assert((d >> 22) == 0); | ||||
| /*    r->n[0] = t0; r->n[1] = t1; r->n[2] = t2; r->n[3] = t3; r->n[4] = t4;
 | ||||
|     r->n[5] = t5; r->n[6] = t6; r->n[7] = t7; r->n[8] = t8; r->n[9] = t9; | ||||
|     fog("         tm2: ", r); */ | ||||
|     // Reduce t9 at the start so there will be at most a single carry from the first pass
 | ||||
|     uint32_t x = t9 >> 22; t9 &= 0x03FFFFFUL; | ||||
|     uint32_t m; | ||||
| 
 | ||||
|     // Subtract p if result >= p
 | ||||
|     uint64_t low = ((uint64_t)t1 << 26) | t0; | ||||
|     uint64_t mask = -(int64_t)((t9 < 0x03FFFFFUL) | (t8 < 0x3FFFFFFUL) | (t7 < 0x3FFFFFFUL) | (t6 < 0x3FFFFFFUL) | (t5 < 0x3FFFFFFUL) | (t4 < 0x3FFFFFFUL) | (t3 < 0x3FFFFFFUL) | (t2 < 0x3FFFFFFUL) | (low < 0xFFFFEFFFFFC2FULL)); | ||||
|     t9 &= mask; | ||||
|     t8 &= mask; | ||||
|     t7 &= mask; | ||||
|     t6 &= mask; | ||||
|     t5 &= mask; | ||||
|     t4 &= mask; | ||||
|     t3 &= mask; | ||||
|     t2 &= mask; | ||||
|     low -= (~mask & 0xFFFFEFFFFFC2FULL); | ||||
|     // The first pass ensures the magnitude is 1, ...
 | ||||
|     t0 += x * 0x3D1UL; t1 += (x << 6); | ||||
|     t1 += (t0 >> 26); t0 &= 0x3FFFFFFUL; | ||||
|     t2 += (t1 >> 26); t1 &= 0x3FFFFFFUL; | ||||
|     t3 += (t2 >> 26); t2 &= 0x3FFFFFFUL; m = t2; | ||||
|     t4 += (t3 >> 26); t3 &= 0x3FFFFFFUL; m &= t3; | ||||
|     t5 += (t4 >> 26); t4 &= 0x3FFFFFFUL; m &= t4; | ||||
|     t6 += (t5 >> 26); t5 &= 0x3FFFFFFUL; m &= t5; | ||||
|     t7 += (t6 >> 26); t6 &= 0x3FFFFFFUL; m &= t6; | ||||
|     t8 += (t7 >> 26); t7 &= 0x3FFFFFFUL; m &= t7; | ||||
|     t9 += (t8 >> 26); t8 &= 0x3FFFFFFUL; m &= t8; | ||||
| 
 | ||||
|     // push internal variables back
 | ||||
|     r->n[0] = low & 0x3FFFFFFUL; r->n[1] = (low >> 26) & 0x3FFFFFFUL; r->n[2] = t2; r->n[3] = t3; r->n[4] = t4; | ||||
|     // ... except for a possible carry at bit 22 of t9 (i.e. bit 256 of the field element)
 | ||||
|     assert(t9 >> 23 == 0); | ||||
| 
 | ||||
|     // At most a single final reduction is needed; check if the value is >= the field characteristic
 | ||||
|     x = (t9 >> 22) | ((t9 == 0x03FFFFFULL) & (m == 0x3FFFFFFULL) | ||||
|         & ((t1 + 0x40UL + ((t0 + 0x3D1UL) >> 26)) > 0x3FFFFFFULL)); | ||||
| 
 | ||||
|     // Apply the final reduction (for constant-time behaviour, we do it always)
 | ||||
|     t0 += x * 0x3D1UL; t1 += (x << 6); | ||||
|     t1 += (t0 >> 26); t0 &= 0x3FFFFFFUL; | ||||
|     t2 += (t1 >> 26); t1 &= 0x3FFFFFFUL; | ||||
|     t3 += (t2 >> 26); t2 &= 0x3FFFFFFUL; | ||||
|     t4 += (t3 >> 26); t3 &= 0x3FFFFFFUL; | ||||
|     t5 += (t4 >> 26); t4 &= 0x3FFFFFFUL; | ||||
|     t6 += (t5 >> 26); t5 &= 0x3FFFFFFUL; | ||||
|     t7 += (t6 >> 26); t6 &= 0x3FFFFFFUL; | ||||
|     t8 += (t7 >> 26); t7 &= 0x3FFFFFFUL; | ||||
|     t9 += (t8 >> 26); t8 &= 0x3FFFFFFUL; | ||||
| 
 | ||||
|     // If t9 didn't carry to bit 22 already, then it should have after any final reduction
 | ||||
|     assert(t9 >> 22 == x); | ||||
| 
 | ||||
|     // Mask off the possible multiple of 2^256 from the final reduction
 | ||||
|     t9 &= 0x03FFFFFUL; | ||||
| 
 | ||||
|     r->n[0] = t0; r->n[1] = t1; r->n[2] = t2; r->n[3] = t3; r->n[4] = t4; | ||||
|     r->n[5] = t5; r->n[6] = t6; r->n[7] = t7; r->n[8] = t8; r->n[9] = t9; | ||||
| /*    fog("         out: ", r);*/ | ||||
| 
 | ||||
| #ifdef VERIFY | ||||
|     r->magnitude = 1; | ||||
|  | ||||
| @ -38,20 +38,20 @@ void static secp256k1_fe_normalize(secp256k1_fe_t *r) { | ||||
| 
 | ||||
|     // Reduce t4 at the start so there will be at most a single carry from the first pass
 | ||||
|     uint64_t x = t4 >> 48; t4 &= 0x0FFFFFFFFFFFFULL; | ||||
|     uint64_t m; | ||||
| 
 | ||||
|     // The first pass ensures the magnitude is 1, ...
 | ||||
|     t0 += x * 0x1000003D1ULL; | ||||
|     t1 += (t0 >> 52); t0 &= 0xFFFFFFFFFFFFFULL; | ||||
|     t2 += (t1 >> 52); t1 &= 0xFFFFFFFFFFFFFULL; | ||||
|     t3 += (t2 >> 52); t2 &= 0xFFFFFFFFFFFFFULL; | ||||
|     t4 += (t3 >> 52); t3 &= 0xFFFFFFFFFFFFFULL; | ||||
|     t2 += (t1 >> 52); t1 &= 0xFFFFFFFFFFFFFULL; m = t1; | ||||
|     t3 += (t2 >> 52); t2 &= 0xFFFFFFFFFFFFFULL; m &= t2; | ||||
|     t4 += (t3 >> 52); t3 &= 0xFFFFFFFFFFFFFULL; m &= t3; | ||||
| 
 | ||||
|     // ... except for a possible carry at bit 48 of t4 (i.e. bit 256 of the field element)
 | ||||
|     assert(t4 >> 49 == 0); | ||||
| 
 | ||||
|     // At most a single final reduction is needed; check if the value is >= the field characteristic
 | ||||
|     x = (t4 >> 48) | ((t4 == 0x0FFFFFFFFFFFFULL) | ||||
|         & ((t3 & t2 & t1) == 0xFFFFFFFFFFFFFULL) | ||||
|     x = (t4 >> 48) | ((t4 == 0x0FFFFFFFFFFFFULL) & (m == 0xFFFFFFFFFFFFFULL) | ||||
|         & (t0 >= 0xFFFFEFFFFFC2FULL)); | ||||
| 
 | ||||
|     // Apply the final reduction (for constant-time behaviour, we do it always)
 | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user