Improve normalization performance for the 32-bit field implementation
- Use a similar approach to the latest 64-bit _normalize. - Add one useful optimization back into the 64-bit _normalize as well. Performance of 'bench' improved by around 0.5% for the 32-bit field (though this was measured on a 64-bit machine).
This commit is contained in:
		
							parent
							
								
									f33793fb99
								
							
						
					
					
						commit
						42822baaa8
					
				| @ -15,77 +15,52 @@ void static secp256k1_fe_inner_start(void) {} | ||||
| void static secp256k1_fe_inner_stop(void) {} | ||||
| 
 | ||||
| void static secp256k1_fe_normalize(secp256k1_fe_t *r) { | ||||
| //    fog("normalize in: ", r);
 | ||||
|     uint32_t c; | ||||
|     c = r->n[0]; | ||||
|     uint32_t t0 = c & 0x3FFFFFFUL; | ||||
|     c = (c >> 26) + r->n[1]; | ||||
|     uint32_t t1 = c & 0x3FFFFFFUL; | ||||
|     c = (c >> 26) + r->n[2]; | ||||
|     uint32_t t2 = c & 0x3FFFFFFUL; | ||||
|     c = (c >> 26) + r->n[3]; | ||||
|     uint32_t t3 = c & 0x3FFFFFFUL; | ||||
|     c = (c >> 26) + r->n[4]; | ||||
|     uint32_t t4 = c & 0x3FFFFFFUL; | ||||
|     c = (c >> 26) + r->n[5]; | ||||
|     uint32_t t5 = c & 0x3FFFFFFUL; | ||||
|     c = (c >> 26) + r->n[6]; | ||||
|     uint32_t t6 = c & 0x3FFFFFFUL; | ||||
|     c = (c >> 26) + r->n[7]; | ||||
|     uint32_t t7 = c & 0x3FFFFFFUL; | ||||
|     c = (c >> 26) + r->n[8]; | ||||
|     uint32_t t8 = c & 0x3FFFFFFUL; | ||||
|     c = (c >> 26) + r->n[9]; | ||||
|     uint32_t t9 = c & 0x03FFFFFUL; | ||||
|     c >>= 22; | ||||
| /*    r->n[0] = t0; r->n[1] = t1; r->n[2] = t2; r->n[3] = t3; r->n[4] = t4;
 | ||||
|     r->n[5] = t5; r->n[6] = t6; r->n[7] = t7; r->n[8] = t8; r->n[9] = t9; | ||||
|     fog("         tm1: ", r); | ||||
|     fprintf(stderr, "out c= %08lx\n", (unsigned long)c);*/ | ||||
|     uint32_t t0 = r->n[0], t1 = r->n[1], t2 = r->n[2], t3 = r->n[3], t4 = r->n[4], | ||||
|              t5 = r->n[5], t6 = r->n[6], t7 = r->n[7], t8 = r->n[8], t9 = r->n[9]; | ||||
| 
 | ||||
|     // The following code will not modify the t's if c is initially 0.
 | ||||
|     uint32_t d = c * 0x3D1UL + t0; | ||||
|     t0 = d & 0x3FFFFFFULL; | ||||
|     d = (d >> 26) + t1 + c*0x40; | ||||
|     t1 = d & 0x3FFFFFFULL; | ||||
|     d = (d >> 26) + t2; | ||||
|     t2 = d & 0x3FFFFFFULL; | ||||
|     d = (d >> 26) + t3; | ||||
|     t3 = d & 0x3FFFFFFULL; | ||||
|     d = (d >> 26) + t4; | ||||
|     t4 = d & 0x3FFFFFFULL; | ||||
|     d = (d >> 26) + t5; | ||||
|     t5 = d & 0x3FFFFFFULL; | ||||
|     d = (d >> 26) + t6; | ||||
|     t6 = d & 0x3FFFFFFULL; | ||||
|     d = (d >> 26) + t7; | ||||
|     t7 = d & 0x3FFFFFFULL; | ||||
|     d = (d >> 26) + t8; | ||||
|     t8 = d & 0x3FFFFFFULL; | ||||
|     d = (d >> 26) + t9; | ||||
|     t9 = d & 0x03FFFFFULL; | ||||
|     assert((d >> 22) == 0); | ||||
| /*    r->n[0] = t0; r->n[1] = t1; r->n[2] = t2; r->n[3] = t3; r->n[4] = t4;
 | ||||
|     r->n[5] = t5; r->n[6] = t6; r->n[7] = t7; r->n[8] = t8; r->n[9] = t9; | ||||
|     fog("         tm2: ", r); */ | ||||
|     // Reduce t9 at the start so there will be at most a single carry from the first pass
 | ||||
|     uint32_t x = t9 >> 22; t9 &= 0x03FFFFFUL; | ||||
|     uint32_t m; | ||||
| 
 | ||||
|     // Subtract p if result >= p
 | ||||
|     uint64_t low = ((uint64_t)t1 << 26) | t0; | ||||
|     uint64_t mask = -(int64_t)((t9 < 0x03FFFFFUL) | (t8 < 0x3FFFFFFUL) | (t7 < 0x3FFFFFFUL) | (t6 < 0x3FFFFFFUL) | (t5 < 0x3FFFFFFUL) | (t4 < 0x3FFFFFFUL) | (t3 < 0x3FFFFFFUL) | (t2 < 0x3FFFFFFUL) | (low < 0xFFFFEFFFFFC2FULL)); | ||||
|     t9 &= mask; | ||||
|     t8 &= mask; | ||||
|     t7 &= mask; | ||||
|     t6 &= mask; | ||||
|     t5 &= mask; | ||||
|     t4 &= mask; | ||||
|     t3 &= mask; | ||||
|     t2 &= mask; | ||||
|     low -= (~mask & 0xFFFFEFFFFFC2FULL); | ||||
|     // The first pass ensures the magnitude is 1, ...
 | ||||
|     t0 += x * 0x3D1UL; t1 += (x << 6); | ||||
|     t1 += (t0 >> 26); t0 &= 0x3FFFFFFUL; | ||||
|     t2 += (t1 >> 26); t1 &= 0x3FFFFFFUL; | ||||
|     t3 += (t2 >> 26); t2 &= 0x3FFFFFFUL; m = t2; | ||||
|     t4 += (t3 >> 26); t3 &= 0x3FFFFFFUL; m &= t3; | ||||
|     t5 += (t4 >> 26); t4 &= 0x3FFFFFFUL; m &= t4; | ||||
|     t6 += (t5 >> 26); t5 &= 0x3FFFFFFUL; m &= t5; | ||||
|     t7 += (t6 >> 26); t6 &= 0x3FFFFFFUL; m &= t6; | ||||
|     t8 += (t7 >> 26); t7 &= 0x3FFFFFFUL; m &= t7; | ||||
|     t9 += (t8 >> 26); t8 &= 0x3FFFFFFUL; m &= t8; | ||||
| 
 | ||||
|     // push internal variables back
 | ||||
|     r->n[0] = low & 0x3FFFFFFUL; r->n[1] = (low >> 26) & 0x3FFFFFFUL; r->n[2] = t2; r->n[3] = t3; r->n[4] = t4; | ||||
|     // ... except for a possible carry at bit 22 of t9 (i.e. bit 256 of the field element)
 | ||||
|     assert(t9 >> 23 == 0); | ||||
| 
 | ||||
|     // At most a single final reduction is needed; check if the value is >= the field characteristic
 | ||||
|     x = (t9 >> 22) | ((t9 == 0x03FFFFFULL) & (m == 0x3FFFFFFULL) | ||||
|         & ((t1 + 0x40UL + ((t0 + 0x3D1UL) >> 26)) > 0x3FFFFFFULL)); | ||||
| 
 | ||||
|     // Apply the final reduction (for constant-time behaviour, we do it always)
 | ||||
|     t0 += x * 0x3D1UL; t1 += (x << 6); | ||||
|     t1 += (t0 >> 26); t0 &= 0x3FFFFFFUL; | ||||
|     t2 += (t1 >> 26); t1 &= 0x3FFFFFFUL; | ||||
|     t3 += (t2 >> 26); t2 &= 0x3FFFFFFUL; | ||||
|     t4 += (t3 >> 26); t3 &= 0x3FFFFFFUL; | ||||
|     t5 += (t4 >> 26); t4 &= 0x3FFFFFFUL; | ||||
|     t6 += (t5 >> 26); t5 &= 0x3FFFFFFUL; | ||||
|     t7 += (t6 >> 26); t6 &= 0x3FFFFFFUL; | ||||
|     t8 += (t7 >> 26); t7 &= 0x3FFFFFFUL; | ||||
|     t9 += (t8 >> 26); t8 &= 0x3FFFFFFUL; | ||||
| 
 | ||||
|     // If t9 didn't carry to bit 22 already, then it should have after any final reduction
 | ||||
|     assert(t9 >> 22 == x); | ||||
| 
 | ||||
|     // Mask off the possible multiple of 2^256 from the final reduction
 | ||||
|     t9 &= 0x03FFFFFUL; | ||||
| 
 | ||||
|     r->n[0] = t0; r->n[1] = t1; r->n[2] = t2; r->n[3] = t3; r->n[4] = t4; | ||||
|     r->n[5] = t5; r->n[6] = t6; r->n[7] = t7; r->n[8] = t8; r->n[9] = t9; | ||||
| /*    fog("         out: ", r);*/ | ||||
| 
 | ||||
| #ifdef VERIFY | ||||
|     r->magnitude = 1; | ||||
|  | ||||
| @ -38,20 +38,20 @@ void static secp256k1_fe_normalize(secp256k1_fe_t *r) { | ||||
| 
 | ||||
|     // Reduce t4 at the start so there will be at most a single carry from the first pass
 | ||||
|     uint64_t x = t4 >> 48; t4 &= 0x0FFFFFFFFFFFFULL; | ||||
|     uint64_t m; | ||||
| 
 | ||||
|     // The first pass ensures the magnitude is 1, ...
 | ||||
|     t0 += x * 0x1000003D1ULL; | ||||
|     t1 += (t0 >> 52); t0 &= 0xFFFFFFFFFFFFFULL; | ||||
|     t2 += (t1 >> 52); t1 &= 0xFFFFFFFFFFFFFULL; | ||||
|     t3 += (t2 >> 52); t2 &= 0xFFFFFFFFFFFFFULL; | ||||
|     t4 += (t3 >> 52); t3 &= 0xFFFFFFFFFFFFFULL; | ||||
|     t2 += (t1 >> 52); t1 &= 0xFFFFFFFFFFFFFULL; m = t1; | ||||
|     t3 += (t2 >> 52); t2 &= 0xFFFFFFFFFFFFFULL; m &= t2; | ||||
|     t4 += (t3 >> 52); t3 &= 0xFFFFFFFFFFFFFULL; m &= t3; | ||||
| 
 | ||||
|     // ... except for a possible carry at bit 48 of t4 (i.e. bit 256 of the field element)
 | ||||
|     assert(t4 >> 49 == 0); | ||||
| 
 | ||||
|     // At most a single final reduction is needed; check if the value is >= the field characteristic
 | ||||
|     x = (t4 >> 48) | ((t4 == 0x0FFFFFFFFFFFFULL) | ||||
|         & ((t3 & t2 & t1) == 0xFFFFFFFFFFFFFULL) | ||||
|     x = (t4 >> 48) | ((t4 == 0x0FFFFFFFFFFFFULL) & (m == 0xFFFFFFFFFFFFFULL) | ||||
|         & (t0 >= 0xFFFFEFFFFFC2FULL)); | ||||
| 
 | ||||
|     // Apply the final reduction (for constant-time behaviour, we do it always)
 | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user