Convert YASM code into inline assembly

2014-12-02 17:47:32 +01:00
parent f048615970
commit 67935050e1
6 changed files with 506 additions and 642 deletions
--- a/src/field_5x52_asm.asm
+++ b/src/field_5x52_asm.asm
@@ -1,529 +0,0 @@
-	;; Copyright (c) 2013-2014 Diederik Huys, Pieter Wuille
-	;; Distributed under the MIT software license, see the accompanying
-	;; file COPYING or http://www.opensource.org/licenses/mit-license.php.
-
-	;; Changelog:
-	;; * March 2013, Diederik Huys: Original version
-	;; * November 2014, Pieter Wuille: Updated to use Peter Dettman's parallel
-	;;                                 multiplication algorithm
-	;;
-	;; Provided public procedures:
-	;; 	secp256k1_fe_mul_inner
-	;; 	secp256k1_fe_sqr_inner
-	;;
-	;; Needed tools: YASM (http://yasm.tortall.net)
-	;;
-	;; 
-
-	BITS 64
-
-%ifidn   __OUTPUT_FORMAT__,macho64
-%define SYM(x) _ %+ x
-%else
-%define SYM(x) x
-%endif
-
-	;;  Procedure ExSetMult
-	;;  Register Layout:
-	;;  INPUT: 	rdi	= a->n
-	;; 	   	rsi  	= b->n
-	;; 	   	rdx  	= r->a
-	;; 
-	;;  INTERNAL:	rdx:rax  = multiplication accumulator
-	;; 		r9:r8    = c
-	;;		r10:r14  = a0-a4
-	;;		rcx:rbx  = d
-	;; 		rbp	 = R
-	;; 		rdi	 = t?
-	;;		r15	 = b->n
-	;;		rsi	 = r->n
-	GLOBAL SYM(secp256k1_fe_mul_inner)
-	ALIGN 32
-SYM(secp256k1_fe_mul_inner):
-	push rbp
-	push rbx
-	push r12
-	push r13
-	push r14
-	push r15
-	mov r10,[rdi+0*8]
-	mov r11,[rdi+1*8]
-	mov r12,[rdi+2*8]
-	mov r13,[rdi+3*8]
-	mov r14,[rdi+4*8]
-	mov rbp,01000003D10h
-	mov r15,rsi
-	mov rsi,rdx
-
-	;; d += a3 * b0
-	mov rax,[r15+0*8]
-	mul r13
-	mov rbx,rax
-	mov rcx,rdx
-	;; d += a2 * b1
-	mov rax,[r15+1*8]
-	mul r12
-	add rbx,rax
-	adc rcx,rdx
-	;; d += a1 * b2
-	mov rax,[r15+2*8]
-	mul r11
-	add rbx,rax
-	adc rcx,rdx
-	;; d = a0 * b3
-	mov rax,[r15+3*8]
-	mul r10
-	add rbx,rax
-	adc rcx,rdx
-	;; c = a4 * b4
-	mov rax,[r15+4*8]
-	mul r14
-	mov r8,rax
-	mov r9,rdx
-	;; d += (c & M) * R
-	mov rdx,0fffffffffffffh
-	and rax,rdx
-	mul rbp
-	add rbx,rax
-	adc rcx,rdx
-	;; c >>= 52 (r8 only)
-	shrd r8,r9,52
-	;; t3 (stack) = d & M
-	mov rdi,rbx
-	mov rdx,0fffffffffffffh
-	and rdi,rdx
-	push rdi
-	;; d >>= 52
-	shrd rbx,rcx,52
-	mov rcx,0
-	;; d += a4 * b0
-	mov rax,[r15+0*8]
-	mul r14
-	add rbx,rax
-	adc rcx,rdx
-	;; d += a3 * b1
-	mov rax,[r15+1*8]
-	mul r13
-	add rbx,rax
-	adc rcx,rdx
-	;; d += a2 * b2
-	mov rax,[r15+2*8]
-	mul r12
-	add rbx,rax
-	adc rcx,rdx
-	;; d += a1 * b3
-	mov rax,[r15+3*8]
-	mul r11
-	add rbx,rax
-	adc rcx,rdx
-	;; d += a0 * b4
-	mov rax,[r15+4*8]
-	mul r10
-	add rbx,rax
-	adc rcx,rdx
-	;; d += c * R
-	mov rax,r8
-	mul rbp
-	add rbx,rax
-	adc rcx,rdx
-	;; t4 = d & M (rdi)
-	mov rdi,rbx
-	mov rdx,0fffffffffffffh
-	and rdi,rdx
-	;; d >>= 52
-	shrd rbx,rcx,52
-	mov rcx,0
-	;; tx = t4 >> 48 (rbp, overwrites R)
-	mov rbp,rdi
-	shr rbp,48
-	;; t4 &= (M >> 4) (stack)
-	mov rax,0ffffffffffffh
-	and rdi,rax
-	push rdi
-	;; c = a0 * b0
-	mov rax,[r15+0*8]
-	mul r10
-	mov r8,rax
-	mov r9,rdx
-	;; d += a4 * b1
-	mov rax,[r15+1*8]
-	mul r14
-	add rbx,rax
-	adc rcx,rdx
-	;; d += a3 * b2
-	mov rax,[r15+2*8]
-	mul r13
-	add rbx,rax
-	adc rcx,rdx
-	;; d += a2 * b3
-	mov rax,[r15+3*8]
-	mul r12
-	add rbx,rax
-	adc rcx,rdx
-	;; d += a1 * b4
-	mov rax,[r15+4*8]
-	mul r11
-	add rbx,rax
-	adc rcx,rdx
-	;; u0 = d & M (rdi)
-	mov rdi,rbx
-	mov rdx,0fffffffffffffh
-	and rdi,rdx
-	;; d >>= 52
-	shrd rbx,rcx,52
-	mov rcx,0
-	;; u0 = (u0 << 4) | tx (rdi)
-	shl rdi,4
-	or rdi,rbp
-	;; c += u0 * (R >> 4)
-	mov rax,01000003D1h
-	mul rdi
-	add r8,rax
-	adc r9,rdx
-	;; r[0] = c & M
-	mov rax,r8
-	mov rdx,0fffffffffffffh
-	and rax,rdx
-	mov [rsi+0*8],rax
-	;; c >>= 52
-	shrd r8,r9,52
-	mov r9,0
-	;; c += a1 * b0
-	mov rax,[r15+0*8]
-	mul r11
-	add r8,rax
-	adc r9,rdx
-	;; c += a0 * b1
-	mov rax,[r15+1*8]
-	mul r10
-	add r8,rax
-	adc r9,rdx
-	;; d += a4 * b2
-	mov rax,[r15+2*8]
-	mul r14
-	add rbx,rax
-	adc rcx,rdx
-	;; d += a3 * b3
-	mov rax,[r15+3*8]
-	mul r13
-	add rbx,rax
-	adc rcx,rdx
-	;; d += a2 * b4
-	mov rax,[r15+4*8]
-	mul r12
-	add rbx,rax
-	adc rcx,rdx
-	;; restore rdp = R
-	mov rbp,01000003D10h
-	;; c += (d & M) * R
-	mov rax,rbx
-	mov rdx,0fffffffffffffh
-	and rax,rdx
-	mul rbp
-	add r8,rax
-	adc r9,rdx
-	;; d >>= 52
-	shrd rbx,rcx,52
-	mov rcx,0
-	;; r[1] = c & M
-	mov rax,r8
-	mov rdx,0fffffffffffffh
-	and rax,rdx
-	mov [rsi+8*1],rax
-	;; c >>= 52
-	shrd r8,r9,52
-	mov r9,0
-	;; c += a2 * b0
-	mov rax,[r15+0*8]
-	mul r12
-	add r8,rax
-	adc r9,rdx
-	;; c += a1 * b1
-	mov rax,[r15+1*8]
-	mul r11
-	add r8,rax
-	adc r9,rdx
-	;; c += a0 * b2 (last use of r10 = a0)
-	mov rax,[r15+2*8]
-	mul r10
-	add r8,rax
-	adc r9,rdx
-	;; fetch t3 (r10, overwrites a0),t4 (rdi)
-	pop rdi
-	pop r10
-	;; d += a4 * b3
-	mov rax,[r15+3*8]
-	mul r14
-	add rbx,rax
-	adc rcx,rdx
-	;; d += a3 * b4
-	mov rax,[r15+4*8]
-	mul r13
-	add rbx,rax
-	adc rcx,rdx
-	;; c += (d & M) * R
-	mov rax,rbx
-	mov rdx,0fffffffffffffh
-	and rax,rdx
-	mul rbp
-	add r8,rax
-	adc r9,rdx
-	;; d >>= 52 (rbx only)
-	shrd rbx,rcx,52
-	;; r[2] = c & M
-	mov rax,r8
-	mov rdx,0fffffffffffffh
-	and rax,rdx
-	mov [rsi+2*8],rax
-	;; c >>= 52
-	shrd r8,r9,52
-	mov r9,0
-	;; c += t3
-	add r8,r10
-	;; c += d * R
-	mov rax,rbx
-	mul rbp
-	add r8,rax
-	adc r9,rdx
-	;; r[3] = c & M
-	mov rax,r8
-	mov rdx,0fffffffffffffh
-	and rax,rdx
-	mov [rsi+3*8],rax
-	;; c >>= 52 (r8 only)
-	shrd r8,r9,52
-	;; c += t4 (r8 only)
-	add r8,rdi
-	;; r[4] = c
-	mov [rsi+4*8],r8
-
-	pop r15
-	pop r14
-	pop r13
-	pop r12
-	pop rbx
-	pop rbp
-	ret
-
-	
-	;;  PROC ExSetSquare
-	;;  Register Layout:
-	;;  INPUT: 	rdi	 = a.n
-	;; 	   	rsi  	 = r.n
-	;;  INTERNAL:	rdx:rax  = multiplication accumulator
-	;; 		r9:r8    = c
-	;;		r10:r14  = a0-a4
-	;;		rcx:rbx  = d
-	;; 		rbp	 = R
-	;; 		rdi	 = t?
-	;;		r15	 = M
-	GLOBAL SYM(secp256k1_fe_sqr_inner)
-	ALIGN 32
-SYM(secp256k1_fe_sqr_inner):
-	push rbp
-	push rbx
-	push r12
-	push r13
-	push r14
-	push r15
-	mov r10,[rdi+0*8]
-	mov r11,[rdi+1*8]
-	mov r12,[rdi+2*8]
-	mov r13,[rdi+3*8]
-	mov r14,[rdi+4*8]
-	mov rbp,01000003D10h
-	mov r15,0fffffffffffffh
-
-	;; d = (a0*2) * a3
-	lea rax,[r10*2]
-	mul r13
-	mov rbx,rax
-	mov rcx,rdx
-	;; d += (a1*2) * a2
-	lea rax,[r11*2]
-	mul r12
-	add rbx,rax
-	adc rcx,rdx
-	;; c = a4 * a4
-	mov rax,r14
-	mul r14
-	mov r8,rax
-	mov r9,rdx
-	;; d += (c & M) * R
-	and rax,r15
-	mul rbp
-	add rbx,rax
-	adc rcx,rdx
-	;; c >>= 52 (r8 only)
-	shrd r8,r9,52
-	;; t3 (stack) = d & M
-	mov rdi,rbx
-	and rdi,r15
-	push rdi
-	;; d >>= 52
-	shrd rbx,rcx,52
-	mov rcx,0
-	;; a4 *= 2
-	add r14,r14
-	;; d += a0 * a4
-	mov rax,r10
-	mul r14
-	add rbx,rax
-	adc rcx,rdx
-	;; d+= (a1*2) * a3
-	lea rax,[r11*2]
-	mul r13
-	add rbx,rax
-	adc rcx,rdx
-	;; d += a2 * a2
-	mov rax,r12
-	mul r12
-	add rbx,rax
-	adc rcx,rdx
-	;; d += c * R
-	mov rax,r8
-	mul rbp
-	add rbx,rax
-	adc rcx,rdx
-	;; t4 = d & M (rdi)
-	mov rdi,rbx
-	and rdi,r15
-	;; d >>= 52
-	shrd rbx,rcx,52
-	mov rcx,0
-	;; tx = t4 >> 48 (rbp, overwrites constant)
-	mov rbp,rdi
-	shr rbp,48
-	;; t4 &= (M >> 4) (stack)
-	mov rax,0ffffffffffffh
-	and rdi,rax
-	push rdi
-	;; c = a0 * a0
-	mov rax,r10
-	mul r10
-	mov r8,rax
-	mov r9,rdx
-	;; d += a1 * a4
-	mov rax,r11
-	mul r14
-	add rbx,rax
-	adc rcx,rdx
-	;; d += (a2*2) * a3
-	lea rax,[r12*2]
-	mul r13
-	add rbx,rax
-	adc rcx,rdx
-	;; u0 = d & M (rdi)
-	mov rdi,rbx
-	and rdi,r15
-	;; d >>= 52
-	shrd rbx,rcx,52
-	mov rcx,0
-	;; u0 = (u0 << 4) | tx (rdi)
-	shl rdi,4
-	or rdi,rbp
-	;; c += u0 * (R >> 4)
-	mov rax,01000003D1h
-	mul rdi
-	add r8,rax
-	adc r9,rdx
-	;; r[0] = c & M
-	mov rax,r8
-	and rax,r15
-	mov [rsi+0*8],rax
-	;; c >>= 52
-	shrd r8,r9,52
-	mov r9,0
-	;; a0 *= 2
-	add r10,r10
-	;; c += a0 * a1
-	mov rax,r10
-	mul r11
-	add r8,rax
-	adc r9,rdx
-	;; d += a2 * a4
-	mov rax,r12
-	mul r14
-	add rbx,rax
-	adc rcx,rdx
-	;; d += a3 * a3
-	mov rax,r13
-	mul r13
-	add rbx,rax
-	adc rcx,rdx
-	;; load R in rbp
-	mov rbp,01000003D10h
-	;; c += (d & M) * R
-	mov rax,rbx
-	and rax,r15
-	mul rbp
-	add r8,rax
-	adc r9,rdx
-	;; d >>= 52
-	shrd rbx,rcx,52
-	mov rcx,0
-	;; r[1] = c & M
-	mov rax,r8
-	and rax,r15
-	mov [rsi+8*1],rax
-	;; c >>= 52
-	shrd r8,r9,52
-	mov r9,0
-	;; c += a0 * a2 (last use of r10)
-	mov rax,r10
-	mul r12
-	add r8,rax
-	adc r9,rdx
-	;; fetch t3 (r10, overwrites a0),t4 (rdi)
-	pop rdi
-	pop r10
-	;; c += a1 * a1
-	mov rax,r11
-	mul r11
-	add r8,rax
-	adc r9,rdx
-	;; d += a3 * a4
-	mov rax,r13
-	mul r14
-	add rbx,rax
-	adc rcx,rdx
-	;; c += (d & M) * R
-	mov rax,rbx
-	and rax,r15
-	mul rbp
-	add r8,rax
-	adc r9,rdx
-	;; d >>= 52 (rbx only)
-	shrd rbx,rcx,52
-	;; r[2] = c & M
-	mov rax,r8
-	and rax,r15
-	mov [rsi+2*8],rax
-	;; c >>= 52
-	shrd r8,r9,52
-	mov r9,0
-	;; c += t3
-	add r8,r10
-	;; c += d * R
-	mov rax,rbx
-	mul rbp
-	add r8,rax
-	adc r9,rdx
-	;; r[3] = c & M
-	mov rax,r8
-	and rax,r15
-	mov [rsi+3*8],rax
-	;; c >>= 52 (r8 only)
-	shrd r8,r9,52
-	;; c += t4 (r8 only)
-	add r8,rdi
-	;; r[4] = c
-	mov [rsi+4*8],r8
-
-	pop r15
-	pop r14
-	pop r13
-	pop r12
-	pop rbx
-	pop rbp
-	ret
--- a/src/field_5x52_asm_impl.h
+++ b/src/field_5x52_asm_impl.h
@@ -1,13 +1,502 @@
 /**********************************************************************
- * Copyright (c) 2013 Pieter Wuille                                   *
+ * Copyright (c) 2013-2014 Diederik Huys, Pieter Wuille               *
 * Distributed under the MIT software license, see the accompanying   *
 * file COPYING or http://www.opensource.org/licenses/mit-license.php.*
 **********************************************************************/

+/**
+ * Changelog:
+ * - March 2013, Diederik Huys:    original version
+ * - November 2014, Pieter Wuille: updated to use Peter Dettman's parallel multiplication algorithm
+ * - December 2014, Pieter Wuille: converted from YASM to GCC inline assembly
+ */
+
 #ifndef _SECP256K1_FIELD_INNER5X52_IMPL_H_
 #define _SECP256K1_FIELD_INNER5X52_IMPL_H_

-void __attribute__ ((sysv_abi)) secp256k1_fe_mul_inner(const uint64_t *a, const uint64_t *b, uint64_t *r);
-void __attribute__ ((sysv_abi)) secp256k1_fe_sqr_inner(const uint64_t *a, uint64_t *r);
+SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b, uint64_t *r) {
+/**
+ * Multiplies 5-limb a by 5-limb b into 5-limb r (52-bit limbs, mask M = 0xfffffffffffff; upper terms are folded back with R).
+ * Registers: rdx:rax = multiplication accumulator; r9:r8 = c; r15:rcx = d
+ *            r10-r14 = a0-a4, all loaded before any store to r (r10 is reused for t3 after the last use of a0)
+ *            rbx     = b (still read after r[0] and r[1] have been stored)
+ *            %2      = r (pinned to rdi by the "D" constraint)
+ *            %0      = a on entry (pinned to rsi by "+S"), then scratch for t3/t4/u0
+ *            rbp     = R (0x1000003d10), temporarily tx; not listed as a clobber, so saved/restored with pushq/popq
+ * NOTE(review): pushq/popq store below %rsp, which the compiler may be using as the SysV red zone - confirm the build accounts for this.
+ */
+__asm__ __volatile__(
+    "pushq %%rbp\n"
+
+    "movq 0(%0),%%r10\n"
+    "movq 8(%0),%%r11\n"
+    "movq 16(%0),%%r12\n"
+    "movq 24(%0),%%r13\n"
+    "movq 32(%0),%%r14\n"
+    "movq $0x1000003d10,%%rbp\n"
+
+    /* d = a3 * b0 (initialises d) */
+    "movq 0(%%rbx),%%rax\n"
+    "mulq %%r13\n"
+    "movq %%rax,%%rcx\n"
+    "movq %%rdx,%%r15\n"
+    /* d += a2 * b1 */
+    "movq 8(%%rbx),%%rax\n"
+    "mulq %%r12\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* d += a1 * b2 */
+    "movq 16(%%rbx),%%rax\n"
+    "mulq %%r11\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* d += a0 * b3 */
+    "movq 24(%%rbx),%%rax\n"
+    "mulq %%r10\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* c = a4 * b4 */
+    "movq 32(%%rbx),%%rax\n"
+    "mulq %%r14\n"
+    "movq %%rax,%%r8\n"
+    "movq %%rdx,%%r9\n"
+    /* d += (c & M) * R */
+    "movq $0xfffffffffffff,%%rdx\n"
+    "andq %%rdx,%%rax\n"
+    "mulq %%rbp\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* c >>= 52 (%%r8 only) */
+    "shrdq $52,%%r9,%%r8\n"
+    /* t3 (stack) = d & M */
+    "movq %%rcx,%0\n"
+    "movq $0xfffffffffffff,%%rdx\n"
+    "andq %%rdx,%0\n"
+    "pushq %0\n"
+    /* d >>= 52 */
+    "shrdq $52,%%r15,%%rcx\n"
+    "xorq %%r15,%%r15\n"
+    /* d += a4 * b0 */
+    "movq 0(%%rbx),%%rax\n"
+    "mulq %%r14\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* d += a3 * b1 */
+    "movq 8(%%rbx),%%rax\n"
+    "mulq %%r13\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* d += a2 * b2 */
+    "movq 16(%%rbx),%%rax\n"
+    "mulq %%r12\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* d += a1 * b3 */
+    "movq 24(%%rbx),%%rax\n"
+    "mulq %%r11\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* d += a0 * b4 */
+    "movq 32(%%rbx),%%rax\n"
+    "mulq %%r10\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* d += c * R */
+    "movq %%r8,%%rax\n"
+    "mulq %%rbp\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* t4 = d & M (%0) */
+    "movq %%rcx,%0\n"
+    "movq $0xfffffffffffff,%%rdx\n"
+    "andq %%rdx,%0\n"
+    /* d >>= 52 */
+    "shrdq $52,%%r15,%%rcx\n"
+    "xorq %%r15,%%r15\n"
+    /* tx = t4 >> 48 (%%rbp, overwrites R) */
+    "movq %0,%%rbp\n"
+    "shrq $48,%%rbp\n"
+    /* t4 &= (M >> 4) (stack) */
+    "movq $0xffffffffffff,%%rax\n"
+    "andq %%rax,%0\n"
+    "pushq %0\n"
+    /* c = a0 * b0 */
+    "movq 0(%%rbx),%%rax\n"
+    "mulq %%r10\n"
+    "movq %%rax,%%r8\n"
+    "movq %%rdx,%%r9\n"
+    /* d += a4 * b1 */
+    "movq 8(%%rbx),%%rax\n"
+    "mulq %%r14\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* d += a3 * b2 */
+    "movq 16(%%rbx),%%rax\n"
+    "mulq %%r13\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* d += a2 * b3 */
+    "movq 24(%%rbx),%%rax\n"
+    "mulq %%r12\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* d += a1 * b4 */
+    "movq 32(%%rbx),%%rax\n"
+    "mulq %%r11\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* u0 = d & M (%0) */
+    "movq %%rcx,%0\n"
+    "movq $0xfffffffffffff,%%rdx\n"
+    "andq %%rdx,%0\n"
+    /* d >>= 52 */
+    "shrdq $52,%%r15,%%rcx\n"
+    "xorq %%r15,%%r15\n"
+    /* u0 = (u0 << 4) | tx (%0) */
+    "shlq $4,%0\n"
+    "orq %%rbp,%0\n"
+    /* c += u0 * (R >> 4) */
+    "movq $0x1000003d1,%%rax\n"
+    "mulq %0\n"
+    "addq %%rax,%%r8\n"
+    "adcq %%rdx,%%r9\n"
+    /* r[0] = c & M */
+    "movq %%r8,%%rax\n"
+    "movq $0xfffffffffffff,%%rdx\n"
+    "andq %%rdx,%%rax\n"
+    "movq %%rax,0(%2)\n"
+    /* c >>= 52 */
+    "shrdq $52,%%r9,%%r8\n"
+    "xorq %%r9,%%r9\n"
+    /* c += a1 * b0 */
+    "movq 0(%%rbx),%%rax\n"
+    "mulq %%r11\n"
+    "addq %%rax,%%r8\n"
+    "adcq %%rdx,%%r9\n"
+    /* c += a0 * b1 */
+    "movq 8(%%rbx),%%rax\n"
+    "mulq %%r10\n"
+    "addq %%rax,%%r8\n"
+    "adcq %%rdx,%%r9\n"
+    /* d += a4 * b2 */
+    "movq 16(%%rbx),%%rax\n"
+    "mulq %%r14\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* d += a3 * b3 */
+    "movq 24(%%rbx),%%rax\n"
+    "mulq %%r13\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* d += a2 * b4 */
+    "movq 32(%%rbx),%%rax\n"
+    "mulq %%r12\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* restore rbp = R (it held tx since the t4 split) */
+    "movq $0x1000003d10,%%rbp\n"
+    /* c += (d & M) * R */
+    "movq %%rcx,%%rax\n"
+    "movq $0xfffffffffffff,%%rdx\n"
+    "andq %%rdx,%%rax\n"
+    "mulq %%rbp\n"
+    "addq %%rax,%%r8\n"
+    "adcq %%rdx,%%r9\n"
+    /* d >>= 52 */
+    "shrdq $52,%%r15,%%rcx\n"
+    "xorq %%r15,%%r15\n"
+    /* r[1] = c & M */
+    "movq %%r8,%%rax\n"
+    "movq $0xfffffffffffff,%%rdx\n"
+    "andq %%rdx,%%rax\n"
+    "movq %%rax,8(%2)\n"
+    /* c >>= 52 */
+    "shrdq $52,%%r9,%%r8\n"
+    "xorq %%r9,%%r9\n"
+    /* c += a2 * b0 */
+    "movq 0(%%rbx),%%rax\n"
+    "mulq %%r12\n"
+    "addq %%rax,%%r8\n"
+    "adcq %%rdx,%%r9\n"
+    /* c += a1 * b1 */
+    "movq 8(%%rbx),%%rax\n"
+    "mulq %%r11\n"
+    "addq %%rax,%%r8\n"
+    "adcq %%rdx,%%r9\n"
+    /* c += a0 * b2 (last use of %%r10 = a0) */
+    "movq 16(%%rbx),%%rax\n"
+    "mulq %%r10\n"
+    "addq %%rax,%%r8\n"
+    "adcq %%rdx,%%r9\n"
+    /* pop t4 (%0) first, then t3 (%%r10, overwrites a0): reverse of the push order */
+    "popq %0\n"
+    "popq %%r10\n"
+    /* d += a4 * b3 */
+    "movq 24(%%rbx),%%rax\n"
+    "mulq %%r14\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* d += a3 * b4 */
+    "movq 32(%%rbx),%%rax\n"
+    "mulq %%r13\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* c += (d & M) * R */
+    "movq %%rcx,%%rax\n"
+    "movq $0xfffffffffffff,%%rdx\n"
+    "andq %%rdx,%%rax\n"
+    "mulq %%rbp\n"
+    "addq %%rax,%%r8\n"
+    "adcq %%rdx,%%r9\n"
+    /* d >>= 52 (%%rcx only) */
+    "shrdq $52,%%r15,%%rcx\n"
+    /* r[2] = c & M */
+    "movq %%r8,%%rax\n"
+    "movq $0xfffffffffffff,%%rdx\n"
+    "andq %%rdx,%%rax\n"
+    "movq %%rax,16(%2)\n"
+    /* c >>= 52 */
+    "shrdq $52,%%r9,%%r8\n"
+    "xorq %%r9,%%r9\n"
+    /* c += t3 */
+    "addq %%r10,%%r8\n"
+    /* c += d * R */
+    "movq %%rcx,%%rax\n"
+    "mulq %%rbp\n"
+    "addq %%rax,%%r8\n"
+    "adcq %%rdx,%%r9\n"
+    /* r[3] = c & M */
+    "movq %%r8,%%rax\n"
+    "movq $0xfffffffffffff,%%rdx\n"
+    "andq %%rdx,%%rax\n"
+    "movq %%rax,24(%2)\n"
+    /* c >>= 52 (%%r8 only) */
+    "shrdq $52,%%r9,%%r8\n"
+    /* c += t4 (%%r8 only) */
+    "addq %0,%%r8\n"
+    /* r[4] = c */
+    "movq %%r8,32(%2)\n"
+
+    "popq %%rbp\n"
+: "+S"(a)
+: "b"(b), "D"(r)
+: "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory"
+);
+}
+
+SECP256K1_INLINE static void secp256k1_fe_sqr_inner(const uint64_t *a, uint64_t *r) {
+/**
+ * Squares the 5-limb value a into 5-limb r (52-bit limbs, mask M; upper terms are folded back with R).
+ * Registers: rdx:rax = multiplication accumulator; r9:r8 = c; rcx:rbx = d
+ *            r10-r14 = a0-a4, all loaded before any store to r (a4, then a0, are doubled in place; r10 is finally reused for t3)
+ *            r15     = M (0xfffffffffffff)
+ *            %1      = r (pinned to rdi by the "D" constraint)
+ *            %0      = a on entry (pinned to rsi by "+S"), then scratch for t3/t4/u0
+ *            rbp     = R (0x1000003d10), temporarily tx; not listed as a clobber, so saved/restored with pushq/popq
+ * NOTE(review): pushq/popq store below %rsp, which the compiler may be using as the SysV red zone - confirm the build accounts for this.
+ */
+__asm__ __volatile__(
+    "pushq %%rbp\n"
+
+    "movq 0(%0),%%r10\n"
+    "movq 8(%0),%%r11\n"
+    "movq 16(%0),%%r12\n"
+    "movq 24(%0),%%r13\n"
+    "movq 32(%0),%%r14\n"
+    "movq $0x1000003d10,%%rbp\n"
+    "movq $0xfffffffffffff,%%r15\n"
+
+    /* d = (a0*2) * a3 */
+    "leaq (%%r10,%%r10,1),%%rax\n"
+    "mulq %%r13\n"
+    "movq %%rax,%%rbx\n"
+    "movq %%rdx,%%rcx\n"
+    /* d += (a1*2) * a2 */
+    "leaq (%%r11,%%r11,1),%%rax\n"
+    "mulq %%r12\n"
+    "addq %%rax,%%rbx\n"
+    "adcq %%rdx,%%rcx\n"
+    /* c = a4 * a4 */
+    "movq %%r14,%%rax\n"
+    "mulq %%r14\n"
+    "movq %%rax,%%r8\n"
+    "movq %%rdx,%%r9\n"
+    /* d += (c & M) * R */
+    "andq %%r15,%%rax\n"
+    "mulq %%rbp\n"
+    "addq %%rax,%%rbx\n"
+    "adcq %%rdx,%%rcx\n"
+    /* c >>= 52 (%%r8 only) */
+    "shrdq $52,%%r9,%%r8\n"
+    /* t3 (stack) = d & M */
+    "movq %%rbx,%0\n"
+    "andq %%r15,%0\n"
+    "pushq %0\n"
+    /* d >>= 52 */
+    "shrdq $52,%%rcx,%%rbx\n"
+    "xorq %%rcx,%%rcx\n"
+    /* a4 *= 2 (a4 * a4 is already done; every later use of %%r14 is a cross term) */
+    "addq %%r14,%%r14\n"
+    /* d += a0 * (a4*2) */
+    "movq %%r10,%%rax\n"
+    "mulq %%r14\n"
+    "addq %%rax,%%rbx\n"
+    "adcq %%rdx,%%rcx\n"
+    /* d += (a1*2) * a3 */
+    "leaq (%%r11,%%r11,1),%%rax\n"
+    "mulq %%r13\n"
+    "addq %%rax,%%rbx\n"
+    "adcq %%rdx,%%rcx\n"
+    /* d += a2 * a2 */
+    "movq %%r12,%%rax\n"
+    "mulq %%r12\n"
+    "addq %%rax,%%rbx\n"
+    "adcq %%rdx,%%rcx\n"
+    /* d += c * R */
+    "movq %%r8,%%rax\n"
+    "mulq %%rbp\n"
+    "addq %%rax,%%rbx\n"
+    "adcq %%rdx,%%rcx\n"
+    /* t4 = d & M (%0) */
+    "movq %%rbx,%0\n"
+    "andq %%r15,%0\n"
+    /* d >>= 52 */
+    "shrdq $52,%%rcx,%%rbx\n"
+    "xorq %%rcx,%%rcx\n"
+    /* tx = t4 >> 48 (%%rbp, overwrites R) */
+    "movq %0,%%rbp\n"
+    "shrq $48,%%rbp\n"
+    /* t4 &= (M >> 4) (stack) */
+    "movq $0xffffffffffff,%%rax\n"
+    "andq %%rax,%0\n"
+    "pushq %0\n"
+    /* c = a0 * a0 */
+    "movq %%r10,%%rax\n"
+    "mulq %%r10\n"
+    "movq %%rax,%%r8\n"
+    "movq %%rdx,%%r9\n"
+    /* d += a1 * (a4*2) */
+    "movq %%r11,%%rax\n"
+    "mulq %%r14\n"
+    "addq %%rax,%%rbx\n"
+    "adcq %%rdx,%%rcx\n"
+    /* d += (a2*2) * a3 */
+    "leaq (%%r12,%%r12,1),%%rax\n"
+    "mulq %%r13\n"
+    "addq %%rax,%%rbx\n"
+    "adcq %%rdx,%%rcx\n"
+    /* u0 = d & M (%0) */
+    "movq %%rbx,%0\n"
+    "andq %%r15,%0\n"
+    /* d >>= 52 */
+    "shrdq $52,%%rcx,%%rbx\n"
+    "xorq %%rcx,%%rcx\n"
+    /* u0 = (u0 << 4) | tx (%0) */
+    "shlq $4,%0\n"
+    "orq %%rbp,%0\n"
+    /* c += u0 * (R >> 4) */
+    "movq $0x1000003d1,%%rax\n"
+    "mulq %0\n"
+    "addq %%rax,%%r8\n"
+    "adcq %%rdx,%%r9\n"
+    /* r[0] = c & M */
+    "movq %%r8,%%rax\n"
+    "andq %%r15,%%rax\n"
+    "movq %%rax,0(%1)\n"
+    /* c >>= 52 */
+    "shrdq $52,%%r9,%%r8\n"
+    "xorq %%r9,%%r9\n"
+    /* a0 *= 2 (a0 * a0 is already done; the remaining uses of %%r10 as a0 are cross terms) */
+    "addq %%r10,%%r10\n"
+    /* c += (a0*2) * a1 */
+    "movq %%r10,%%rax\n"
+    "mulq %%r11\n"
+    "addq %%rax,%%r8\n"
+    "adcq %%rdx,%%r9\n"
+    /* d += a2 * (a4*2) */
+    "movq %%r12,%%rax\n"
+    "mulq %%r14\n"
+    "addq %%rax,%%rbx\n"
+    "adcq %%rdx,%%rcx\n"
+    /* d += a3 * a3 */
+    "movq %%r13,%%rax\n"
+    "mulq %%r13\n"
+    "addq %%rax,%%rbx\n"
+    "adcq %%rdx,%%rcx\n"
+    /* reload R in %%rbp (it held tx since the t4 split) */
+    "movq $0x1000003d10,%%rbp\n"
+    /* c += (d & M) * R */
+    "movq %%rbx,%%rax\n"
+    "andq %%r15,%%rax\n"
+    "mulq %%rbp\n"
+    "addq %%rax,%%r8\n"
+    "adcq %%rdx,%%r9\n"
+    /* d >>= 52 */
+    "shrdq $52,%%rcx,%%rbx\n"
+    "xorq %%rcx,%%rcx\n"
+    /* r[1] = c & M */
+    "movq %%r8,%%rax\n"
+    "andq %%r15,%%rax\n"
+    "movq %%rax,8(%1)\n"
+    /* c >>= 52 */
+    "shrdq $52,%%r9,%%r8\n"
+    "xorq %%r9,%%r9\n"
+    /* c += (a0*2) * a2 (last use of %%r10 as a0) */
+    "movq %%r10,%%rax\n"
+    "mulq %%r12\n"
+    "addq %%rax,%%r8\n"
+    "adcq %%rdx,%%r9\n"
+    /* pop t4 (%0) first, then t3 (%%r10, overwrites a0): reverse of the push order */
+    "popq %0\n"
+    "popq %%r10\n"
+    /* c += a1 * a1 */
+    "movq %%r11,%%rax\n"
+    "mulq %%r11\n"
+    "addq %%rax,%%r8\n"
+    "adcq %%rdx,%%r9\n"
+    /* d += a3 * (a4*2) */
+    "movq %%r13,%%rax\n"
+    "mulq %%r14\n"
+    "addq %%rax,%%rbx\n"
+    "adcq %%rdx,%%rcx\n"
+    /* c += (d & M) * R */
+    "movq %%rbx,%%rax\n"
+    "andq %%r15,%%rax\n"
+    "mulq %%rbp\n"
+    "addq %%rax,%%r8\n"
+    "adcq %%rdx,%%r9\n"
+    /* d >>= 52 (%%rbx only) */
+    "shrdq $52,%%rcx,%%rbx\n"
+    /* r[2] = c & M */
+    "movq %%r8,%%rax\n"
+    "andq %%r15,%%rax\n"
+    "movq %%rax,16(%1)\n"
+    /* c >>= 52 */
+    "shrdq $52,%%r9,%%r8\n"
+    "xorq %%r9,%%r9\n"
+    /* c += t3 */
+    "addq %%r10,%%r8\n"
+    /* c += d * R */
+    "movq %%rbx,%%rax\n"
+    "mulq %%rbp\n"
+    "addq %%rax,%%r8\n"
+    "adcq %%rdx,%%r9\n"
+    /* r[3] = c & M */
+    "movq %%r8,%%rax\n"
+    "andq %%r15,%%rax\n"
+    "movq %%rax,24(%1)\n"
+    /* c >>= 52 (%%r8 only) */
+    "shrdq $52,%%r9,%%r8\n"
+    /* c += t4 (%%r8 only) */
+    "addq %0,%%r8\n"
+    /* r[4] = c */
+    "movq %%r8,32(%1)\n"
+
+    "popq %%rbp\n"
+: "+S"(a)
+: "D"(r)
+: "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory"
+);
+}

 #endif