Convert YASM code into inline assembly

This commit is contained in:
Pieter Wuille
2014-12-02 17:47:32 +01:00
parent f048615970
commit 67935050e1
6 changed files with 506 additions and 642 deletions

View File

@@ -1,529 +0,0 @@
;; Copyright (c) 2013-2014 Diederik Huys, Pieter Wuille
;; Distributed under the MIT software license, see the accompanying
;; file COPYING or http://www.opensource.org/licenses/mit-license.php.
;; Changelog:
;; * March 2013, Diederik Huys: Original version
;; * November 2014, Pieter Wuille: Updated to use Peter Dettman's parallel
;; multiplication algorithm
;;
;; Provided public procedures:
;; secp256k1_fe_mul_inner
;; secp256k1_fe_sqr_inner
;;
;; Needed tools: YASM (http://yasm.tortall.net)
;;
;;
BITS 64
%ifidn __OUTPUT_FORMAT__,macho64
%define SYM(x) _ %+ x
%else
%define SYM(x) x
%endif
;; Procedure ExSetMult (exported below as secp256k1_fe_mul_inner)
;; Multiplies the five 64-bit limbs read from a->n by the five limbs read
;; from b->n and stores five result limbs to r->n.
;; Partial products are summed in two 128-bit accumulators, c (r9:r8) and
;; d (rcx:rbx). Limbs are split off with the 52-bit mask
;; M = 0xfffffffffffff, and the parts taken from d are multiplied by
;; R = 0x1000003D10 before being added into c.
;; NOTE(review): presumably this implements reduction modulo the secp256k1
;; field prime -- confirm against the C reference implementation.
;; Register Layout:
;; INPUT: rdi = a->n
;; rsi = b->n
;; rdx = r->n
;;
;; INTERNAL: rdx:rax = multiplication accumulator
;; r9:r8 = c
;; r10-r14 = a0-a4
;; rcx:rbx = d
;; rbp = R (temporarily reused for tx, then reloaded)
;; rdi = t?
;; r15 = b->n
;; rsi = r->n
;; STACK: rbp, rbx, r12-r15 are pushed on entry and popped before ret;
;; t3 and t4 are additionally parked on the stack mid-routine.
GLOBAL SYM(secp256k1_fe_mul_inner)
ALIGN 32
SYM(secp256k1_fe_mul_inner):
;; save the registers that are restored before ret
push rbp
push rbx
push r12
push r13
push r14
push r15
;; load a0-a4 into r10-r14 (rdi becomes free for temporaries afterwards)
mov r10,[rdi+0*8]
mov r11,[rdi+1*8]
mov r12,[rdi+2*8]
mov r13,[rdi+3*8]
mov r14,[rdi+4*8]
;; rbp = R
mov rbp,01000003D10h
;; r15 = b->n, rsi = r->n (rdx is overwritten by every mul below)
mov r15,rsi
mov rsi,rdx
;; d = a3 * b0
mov rax,[r15+0*8]
mul r13
mov rbx,rax
mov rcx,rdx
;; d += a2 * b1
mov rax,[r15+1*8]
mul r12
add rbx,rax
adc rcx,rdx
;; d += a1 * b2
mov rax,[r15+2*8]
mul r11
add rbx,rax
adc rcx,rdx
;; d += a0 * b3
mov rax,[r15+3*8]
mul r10
add rbx,rax
adc rcx,rdx
;; c = a4 * b4
mov rax,[r15+4*8]
mul r14
mov r8,rax
mov r9,rdx
;; d += (c & M) * R (rax still holds the low word of c)
mov rdx,0fffffffffffffh
and rax,rdx
mul rbp
add rbx,rax
adc rcx,rdx
;; c >>= 52 (r8 only)
shrd r8,r9,52
;; t3 (stack) = d & M
mov rdi,rbx
mov rdx,0fffffffffffffh
and rdi,rdx
push rdi
;; d >>= 52
shrd rbx,rcx,52
mov rcx,0
;; d += a4 * b0
mov rax,[r15+0*8]
mul r14
add rbx,rax
adc rcx,rdx
;; d += a3 * b1
mov rax,[r15+1*8]
mul r13
add rbx,rax
adc rcx,rdx
;; d += a2 * b2
mov rax,[r15+2*8]
mul r12
add rbx,rax
adc rcx,rdx
;; d += a1 * b3
mov rax,[r15+3*8]
mul r11
add rbx,rax
adc rcx,rdx
;; d += a0 * b4
mov rax,[r15+4*8]
mul r10
add rbx,rax
adc rcx,rdx
;; d += c * R
mov rax,r8
mul rbp
add rbx,rax
adc rcx,rdx
;; t4 = d & M (rdi)
mov rdi,rbx
mov rdx,0fffffffffffffh
and rdi,rdx
;; d >>= 52
shrd rbx,rcx,52
mov rcx,0
;; tx = t4 >> 48 (rbp, overwrites R)
mov rbp,rdi
shr rbp,48
;; t4 &= (M >> 4) (stack)
mov rax,0ffffffffffffh
and rdi,rax
push rdi
;; c = a0 * b0
mov rax,[r15+0*8]
mul r10
mov r8,rax
mov r9,rdx
;; d += a4 * b1
mov rax,[r15+1*8]
mul r14
add rbx,rax
adc rcx,rdx
;; d += a3 * b2
mov rax,[r15+2*8]
mul r13
add rbx,rax
adc rcx,rdx
;; d += a2 * b3
mov rax,[r15+3*8]
mul r12
add rbx,rax
adc rcx,rdx
;; d += a1 * b4
mov rax,[r15+4*8]
mul r11
add rbx,rax
adc rcx,rdx
;; u0 = d & M (rdi)
mov rdi,rbx
mov rdx,0fffffffffffffh
and rdi,rdx
;; d >>= 52
shrd rbx,rcx,52
mov rcx,0
;; u0 = (u0 << 4) | tx (rdi)
shl rdi,4
or rdi,rbp
;; c += u0 * (R >> 4)
mov rax,01000003D1h
mul rdi
add r8,rax
adc r9,rdx
;; r[0] = c & M
mov rax,r8
mov rdx,0fffffffffffffh
and rax,rdx
mov [rsi+0*8],rax
;; c >>= 52
shrd r8,r9,52
mov r9,0
;; c += a1 * b0
mov rax,[r15+0*8]
mul r11
add r8,rax
adc r9,rdx
;; c += a0 * b1
mov rax,[r15+1*8]
mul r10
add r8,rax
adc r9,rdx
;; d += a4 * b2
mov rax,[r15+2*8]
mul r14
add rbx,rax
adc rcx,rdx
;; d += a3 * b3
mov rax,[r15+3*8]
mul r13
add rbx,rax
adc rcx,rdx
;; d += a2 * b4
mov rax,[r15+4*8]
mul r12
add rbx,rax
adc rcx,rdx
;; restore rbp = R (it held tx since the t4 split above)
mov rbp,01000003D10h
;; c += (d & M) * R
mov rax,rbx
mov rdx,0fffffffffffffh
and rax,rdx
mul rbp
add r8,rax
adc r9,rdx
;; d >>= 52
shrd rbx,rcx,52
mov rcx,0
;; r[1] = c & M
mov rax,r8
mov rdx,0fffffffffffffh
and rax,rdx
mov [rsi+8*1],rax
;; c >>= 52
shrd r8,r9,52
mov r9,0
;; c += a2 * b0
mov rax,[r15+0*8]
mul r12
add r8,rax
adc r9,rdx
;; c += a1 * b1
mov rax,[r15+1*8]
mul r11
add r8,rax
adc r9,rdx
;; c += a0 * b2 (last use of r10 = a0)
mov rax,[r15+2*8]
mul r10
add r8,rax
adc r9,rdx
;; fetch t3 (r10, overwrites a0),t4 (rdi); popped in reverse push order
pop rdi
pop r10
;; d += a4 * b3
mov rax,[r15+3*8]
mul r14
add rbx,rax
adc rcx,rdx
;; d += a3 * b4
mov rax,[r15+4*8]
mul r13
add rbx,rax
adc rcx,rdx
;; c += (d & M) * R
mov rax,rbx
mov rdx,0fffffffffffffh
and rax,rdx
mul rbp
add r8,rax
adc r9,rdx
;; d >>= 52 (rbx only)
shrd rbx,rcx,52
;; r[2] = c & M
mov rax,r8
mov rdx,0fffffffffffffh
and rax,rdx
mov [rsi+2*8],rax
;; c >>= 52
shrd r8,r9,52
mov r9,0
;; c += t3
add r8,r10
;; c += d * R
mov rax,rbx
mul rbp
add r8,rax
adc r9,rdx
;; r[3] = c & M
mov rax,r8
mov rdx,0fffffffffffffh
and rax,rdx
mov [rsi+3*8],rax
;; c >>= 52 (r8 only)
shrd r8,r9,52
;; c += t4 (r8 only)
add r8,rdi
;; r[4] = c
mov [rsi+4*8],r8
;; restore saved registers in reverse order of the entry pushes
pop r15
pop r14
pop r13
pop r12
pop rbx
pop rbp
ret
;; PROC ExSetSquare (exported below as secp256k1_fe_sqr_inner)
;; Squares the five 64-bit limbs read from a.n and stores five result limbs
;; to r.n. Same accumulator scheme as the multiply routine (c in r9:r8,
;; d in rcx:rbx, 52-bit mask M, constant R = 0x1000003D10), but each
;; cross product ai*aj (i != j) is computed once with one factor doubled.
;; Register Layout:
;; INPUT: rdi = a.n
;; rsi = r.n
;; INTERNAL: rdx:rax = multiplication accumulator
;; r9:r8 = c
;; r10-r14 = a0-a4 (r14 is doubled in place early on, r10 later)
;; rcx:rbx = d
;; rbp = R (temporarily reused for tx, then reloaded)
;; rdi = t?
;; r15 = M
;; STACK: rbp, rbx, r12-r15 are pushed on entry and popped before ret;
;; t3 and t4 are additionally parked on the stack mid-routine.
GLOBAL SYM(secp256k1_fe_sqr_inner)
ALIGN 32
SYM(secp256k1_fe_sqr_inner):
;; save the registers that are restored before ret
push rbp
push rbx
push r12
push r13
push r14
push r15
;; load a0-a4 into r10-r14 (rdi becomes free for temporaries afterwards)
mov r10,[rdi+0*8]
mov r11,[rdi+1*8]
mov r12,[rdi+2*8]
mov r13,[rdi+3*8]
mov r14,[rdi+4*8]
;; rbp = R, r15 = M
mov rbp,01000003D10h
mov r15,0fffffffffffffh
;; d = (a0*2) * a3
lea rax,[r10*2]
mul r13
mov rbx,rax
mov rcx,rdx
;; d += (a1*2) * a2
lea rax,[r11*2]
mul r12
add rbx,rax
adc rcx,rdx
;; c = a4 * a4
mov rax,r14
mul r14
mov r8,rax
mov r9,rdx
;; d += (c & M) * R (rax still holds the low word of c)
and rax,r15
mul rbp
add rbx,rax
adc rcx,rdx
;; c >>= 52 (r8 only)
shrd r8,r9,52
;; t3 (stack) = d & M
mov rdi,rbx
and rdi,r15
push rdi
;; d >>= 52
shrd rbx,rcx,52
mov rcx,0
;; a4 *= 2 (r14 holds 2*a4 from here on)
add r14,r14
;; d += a0 * a4 (a4 already doubled)
mov rax,r10
mul r14
add rbx,rax
adc rcx,rdx
;; d += (a1*2) * a3
lea rax,[r11*2]
mul r13
add rbx,rax
adc rcx,rdx
;; d += a2 * a2
mov rax,r12
mul r12
add rbx,rax
adc rcx,rdx
;; d += c * R
mov rax,r8
mul rbp
add rbx,rax
adc rcx,rdx
;; t4 = d & M (rdi)
mov rdi,rbx
and rdi,r15
;; d >>= 52
shrd rbx,rcx,52
mov rcx,0
;; tx = t4 >> 48 (rbp, overwrites constant)
mov rbp,rdi
shr rbp,48
;; t4 &= (M >> 4) (stack)
mov rax,0ffffffffffffh
and rdi,rax
push rdi
;; c = a0 * a0
mov rax,r10
mul r10
mov r8,rax
mov r9,rdx
;; d += a1 * a4 (a4 already doubled)
mov rax,r11
mul r14
add rbx,rax
adc rcx,rdx
;; d += (a2*2) * a3
lea rax,[r12*2]
mul r13
add rbx,rax
adc rcx,rdx
;; u0 = d & M (rdi)
mov rdi,rbx
and rdi,r15
;; d >>= 52
shrd rbx,rcx,52
mov rcx,0
;; u0 = (u0 << 4) | tx (rdi)
shl rdi,4
or rdi,rbp
;; c += u0 * (R >> 4)
mov rax,01000003D1h
mul rdi
add r8,rax
adc r9,rdx
;; r[0] = c & M
mov rax,r8
and rax,r15
mov [rsi+0*8],rax
;; c >>= 52
shrd r8,r9,52
mov r9,0
;; a0 *= 2 (r10 holds 2*a0 from here on)
add r10,r10
;; c += a0 * a1 (a0 already doubled)
mov rax,r10
mul r11
add r8,rax
adc r9,rdx
;; d += a2 * a4 (a4 already doubled)
mov rax,r12
mul r14
add rbx,rax
adc rcx,rdx
;; d += a3 * a3
mov rax,r13
mul r13
add rbx,rax
adc rcx,rdx
;; load R in rbp (it held tx since the t4 split above)
mov rbp,01000003D10h
;; c += (d & M) * R
mov rax,rbx
and rax,r15
mul rbp
add r8,rax
adc r9,rdx
;; d >>= 52
shrd rbx,rcx,52
mov rcx,0
;; r[1] = c & M
mov rax,r8
and rax,r15
mov [rsi+8*1],rax
;; c >>= 52
shrd r8,r9,52
mov r9,0
;; c += a0 * a2 (a0 already doubled; last use of r10)
mov rax,r10
mul r12
add r8,rax
adc r9,rdx
;; fetch t3 (r10, overwrites a0),t4 (rdi); popped in reverse push order
pop rdi
pop r10
;; c += a1 * a1
mov rax,r11
mul r11
add r8,rax
adc r9,rdx
;; d += a3 * a4 (a4 already doubled)
mov rax,r13
mul r14
add rbx,rax
adc rcx,rdx
;; c += (d & M) * R
mov rax,rbx
and rax,r15
mul rbp
add r8,rax
adc r9,rdx
;; d >>= 52 (rbx only)
shrd rbx,rcx,52
;; r[2] = c & M
mov rax,r8
and rax,r15
mov [rsi+2*8],rax
;; c >>= 52
shrd r8,r9,52
mov r9,0
;; c += t3
add r8,r10
;; c += d * R
mov rax,rbx
mul rbp
add r8,rax
adc r9,rdx
;; r[3] = c & M
mov rax,r8
and rax,r15
mov [rsi+3*8],rax
;; c >>= 52 (r8 only)
shrd r8,r9,52
;; c += t4 (r8 only)
add r8,rdi
;; r[4] = c
mov [rsi+4*8],r8
;; restore saved registers in reverse order of the entry pushes
pop r15
pop r14
pop r13
pop r12
pop rbx
pop rbp
ret

View File

@@ -1,13 +1,502 @@
/**********************************************************************
* Copyright (c) 2013 Pieter Wuille *
* Copyright (c) 2013-2014 Diederik Huys, Pieter Wuille *
* Distributed under the MIT software license, see the accompanying *
* file COPYING or http://www.opensource.org/licenses/mit-license.php.*
**********************************************************************/
/**
* Changelog:
* - March 2013, Diederik Huys: original version
* - November 2014, Pieter Wuille: updated to use Peter Dettman's parallel multiplication algorithm
* - December 2014, Pieter Wuille: converted from YASM to GCC inline assembly
*/
#ifndef _SECP256K1_FIELD_INNER5X52_IMPL_H_
#define _SECP256K1_FIELD_INNER5X52_IMPL_H_
void __attribute__ ((sysv_abi)) secp256k1_fe_mul_inner(const uint64_t *a, const uint64_t *b, uint64_t *r);
void __attribute__ ((sysv_abi)) secp256k1_fe_sqr_inner(const uint64_t *a, uint64_t *r);
SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b, uint64_t *r) {
/**
 * Multiplies the five 64-bit limbs at a by the five limbs at b and stores
 * five result limbs at r. Partial products are summed in two 128-bit
 * accumulators (c and d); limbs are split off with M = 0xfffffffffffff and
 * the parts taken from d are multiplied by R = 0x1000003d10 before being
 * added into c.
 *
 * The asm must not touch the stack pointer or %rbp: this function is inlined
 * into callers whose locals may live in the red zone below %rsp, which a
 * pushq would overwrite, and %rbp cannot be declared as a clobber when it is
 * the frame pointer. The temporaries t3/t4/tx are therefore compiler-managed
 * memory operands, and R is loaded into %rdx right before each use
 * (mulq %rdx reads %rdx before overwriting it with the high product word).
 *
 * Registers: rdx:rax = multiplication accumulator
 *            r9:r8   = c
 *            r15:rcx = d
 *            r10-r14 = a0-a4
 *            rbx     = b
 *            rdi     = r
 *            rsi     = a / t?
 */
    uint64_t t3, t4, tx;
    __asm__ __volatile__(
    "movq 0(%%rsi),%%r10\n"
    "movq 8(%%rsi),%%r11\n"
    "movq 16(%%rsi),%%r12\n"
    "movq 24(%%rsi),%%r13\n"
    "movq 32(%%rsi),%%r14\n"

    /* d = a3 * b0 */
    "movq 0(%%rbx),%%rax\n"
    "mulq %%r13\n"
    "movq %%rax,%%rcx\n"
    "movq %%rdx,%%r15\n"
    /* d += a2 * b1 */
    "movq 8(%%rbx),%%rax\n"
    "mulq %%r12\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += a1 * b2 */
    "movq 16(%%rbx),%%rax\n"
    "mulq %%r11\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += a0 * b3 */
    "movq 24(%%rbx),%%rax\n"
    "mulq %%r10\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* c = a4 * b4 */
    "movq 32(%%rbx),%%rax\n"
    "mulq %%r14\n"
    "movq %%rax,%%r8\n"
    "movq %%rdx,%%r9\n"
    /* d += (c & M) * R */
    "movq $0xfffffffffffff,%%rdx\n"
    "andq %%rdx,%%rax\n"
    "movq $0x1000003d10,%%rdx\n"
    "mulq %%rdx\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* c >>= 52 (%%r8 only) */
    "shrdq $52,%%r9,%%r8\n"
    /* t3 (memory) = d & M */
    "movq %%rcx,%%rsi\n"
    "movq $0xfffffffffffff,%%rdx\n"
    "andq %%rdx,%%rsi\n"
    "movq %%rsi,%[t3]\n"
    /* d >>= 52 */
    "shrdq $52,%%r15,%%rcx\n"
    "xorq %%r15,%%r15\n"
    /* d += a4 * b0 */
    "movq 0(%%rbx),%%rax\n"
    "mulq %%r14\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += a3 * b1 */
    "movq 8(%%rbx),%%rax\n"
    "mulq %%r13\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += a2 * b2 */
    "movq 16(%%rbx),%%rax\n"
    "mulq %%r12\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += a1 * b3 */
    "movq 24(%%rbx),%%rax\n"
    "mulq %%r11\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += a0 * b4 */
    "movq 32(%%rbx),%%rax\n"
    "mulq %%r10\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += c * R */
    "movq %%r8,%%rax\n"
    "movq $0x1000003d10,%%rdx\n"
    "mulq %%rdx\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* t4 = d & M (%%rsi) */
    "movq %%rcx,%%rsi\n"
    "movq $0xfffffffffffff,%%rdx\n"
    "andq %%rdx,%%rsi\n"
    /* d >>= 52 */
    "shrdq $52,%%r15,%%rcx\n"
    "xorq %%r15,%%r15\n"
    /* tx (memory) = t4 >> 48 */
    "movq %%rsi,%%rax\n"
    "shrq $48,%%rax\n"
    "movq %%rax,%[tx]\n"
    /* t4 &= (M >> 4) (memory) */
    "movq $0xffffffffffff,%%rax\n"
    "andq %%rax,%%rsi\n"
    "movq %%rsi,%[t4]\n"
    /* c = a0 * b0 */
    "movq 0(%%rbx),%%rax\n"
    "mulq %%r10\n"
    "movq %%rax,%%r8\n"
    "movq %%rdx,%%r9\n"
    /* d += a4 * b1 */
    "movq 8(%%rbx),%%rax\n"
    "mulq %%r14\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += a3 * b2 */
    "movq 16(%%rbx),%%rax\n"
    "mulq %%r13\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += a2 * b3 */
    "movq 24(%%rbx),%%rax\n"
    "mulq %%r12\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += a1 * b4 */
    "movq 32(%%rbx),%%rax\n"
    "mulq %%r11\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* u0 = d & M (%%rsi) */
    "movq %%rcx,%%rsi\n"
    "movq $0xfffffffffffff,%%rdx\n"
    "andq %%rdx,%%rsi\n"
    /* d >>= 52 */
    "shrdq $52,%%r15,%%rcx\n"
    "xorq %%r15,%%r15\n"
    /* u0 = (u0 << 4) | tx (%%rsi) */
    "shlq $4,%%rsi\n"
    "orq %[tx],%%rsi\n"
    /* c += u0 * (R >> 4) */
    "movq $0x1000003d1,%%rax\n"
    "mulq %%rsi\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* r[0] = c & M */
    "movq %%r8,%%rax\n"
    "movq $0xfffffffffffff,%%rdx\n"
    "andq %%rdx,%%rax\n"
    "movq %%rax,0(%%rdi)\n"
    /* c >>= 52 */
    "shrdq $52,%%r9,%%r8\n"
    "xorq %%r9,%%r9\n"
    /* c += a1 * b0 */
    "movq 0(%%rbx),%%rax\n"
    "mulq %%r11\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* c += a0 * b1 */
    "movq 8(%%rbx),%%rax\n"
    "mulq %%r10\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* d += a4 * b2 */
    "movq 16(%%rbx),%%rax\n"
    "mulq %%r14\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += a3 * b3 */
    "movq 24(%%rbx),%%rax\n"
    "mulq %%r13\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += a2 * b4 */
    "movq 32(%%rbx),%%rax\n"
    "mulq %%r12\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* c += (d & M) * R */
    "movq %%rcx,%%rax\n"
    "movq $0xfffffffffffff,%%rdx\n"
    "andq %%rdx,%%rax\n"
    "movq $0x1000003d10,%%rdx\n"
    "mulq %%rdx\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* d >>= 52 */
    "shrdq $52,%%r15,%%rcx\n"
    "xorq %%r15,%%r15\n"
    /* r[1] = c & M */
    "movq %%r8,%%rax\n"
    "movq $0xfffffffffffff,%%rdx\n"
    "andq %%rdx,%%rax\n"
    "movq %%rax,8(%%rdi)\n"
    /* c >>= 52 */
    "shrdq $52,%%r9,%%r8\n"
    "xorq %%r9,%%r9\n"
    /* c += a2 * b0 */
    "movq 0(%%rbx),%%rax\n"
    "mulq %%r12\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* c += a1 * b1 */
    "movq 8(%%rbx),%%rax\n"
    "mulq %%r11\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* c += a0 * b2 (last use of %%r10 = a0) */
    "movq 16(%%rbx),%%rax\n"
    "mulq %%r10\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* fetch t3 (%%r10, overwrites a0), t4 (%%rsi) */
    "movq %[t4],%%rsi\n"
    "movq %[t3],%%r10\n"
    /* d += a4 * b3 */
    "movq 24(%%rbx),%%rax\n"
    "mulq %%r14\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += a3 * b4 */
    "movq 32(%%rbx),%%rax\n"
    "mulq %%r13\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* c += (d & M) * R */
    "movq %%rcx,%%rax\n"
    "movq $0xfffffffffffff,%%rdx\n"
    "andq %%rdx,%%rax\n"
    "movq $0x1000003d10,%%rdx\n"
    "mulq %%rdx\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* d >>= 52 (%%rcx only) */
    "shrdq $52,%%r15,%%rcx\n"
    /* r[2] = c & M */
    "movq %%r8,%%rax\n"
    "movq $0xfffffffffffff,%%rdx\n"
    "andq %%rdx,%%rax\n"
    "movq %%rax,16(%%rdi)\n"
    /* c >>= 52 */
    "shrdq $52,%%r9,%%r8\n"
    "xorq %%r9,%%r9\n"
    /* c += t3 */
    "addq %%r10,%%r8\n"
    /* c += d * R */
    "movq %%rcx,%%rax\n"
    "movq $0x1000003d10,%%rdx\n"
    "mulq %%rdx\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* r[3] = c & M */
    "movq %%r8,%%rax\n"
    "movq $0xfffffffffffff,%%rdx\n"
    "andq %%rdx,%%rax\n"
    "movq %%rax,24(%%rdi)\n"
    /* c >>= 52 (%%r8 only) */
    "shrdq $52,%%r9,%%r8\n"
    /* c += t4 (%%r8 only) */
    "addq %%rsi,%%r8\n"
    /* r[4] = c */
    "movq %%r8,32(%%rdi)\n"
    : "+S"(a), [t3] "=m"(t3), [t4] "=m"(t4), [tx] "=m"(tx)
    : "b"(b), "D"(r)
    : "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory"
    );
}
SECP256K1_INLINE static void secp256k1_fe_sqr_inner(const uint64_t *a, uint64_t *r) {
/**
 * Squares the five 64-bit limbs at a and stores five result limbs at r.
 * Same accumulator scheme as secp256k1_fe_mul_inner (c and d, 52-bit mask M,
 * constant R = 0x1000003d10), but each cross product ai*aj (i != j) is
 * computed once with one factor doubled.
 *
 * The asm must not touch the stack pointer or %rbp: this function is inlined
 * into callers whose locals may live in the red zone below %rsp, which a
 * pushq would overwrite, and %rbp cannot be declared as a clobber when it is
 * the frame pointer. The temporaries t3/t4/tx are therefore compiler-managed
 * memory operands, and R is loaded into %rdx right before each use
 * (mulq %rdx reads %rdx before overwriting it with the high product word).
 *
 * Registers: rdx:rax = multiplication accumulator
 *            r9:r8   = c
 *            rcx:rbx = d
 *            r10-r14 = a0-a4 (r14 is doubled in place early on, r10 later)
 *            r15     = M (0xfffffffffffff)
 *            rdi     = r
 *            rsi     = a / t?
 */
    uint64_t t3, t4, tx;
    __asm__ __volatile__(
    "movq 0(%%rsi),%%r10\n"
    "movq 8(%%rsi),%%r11\n"
    "movq 16(%%rsi),%%r12\n"
    "movq 24(%%rsi),%%r13\n"
    "movq 32(%%rsi),%%r14\n"
    "movq $0xfffffffffffff,%%r15\n"

    /* d = (a0*2) * a3 */
    "leaq (%%r10,%%r10,1),%%rax\n"
    "mulq %%r13\n"
    "movq %%rax,%%rbx\n"
    "movq %%rdx,%%rcx\n"
    /* d += (a1*2) * a2 */
    "leaq (%%r11,%%r11,1),%%rax\n"
    "mulq %%r12\n"
    "addq %%rax,%%rbx\n"
    "adcq %%rdx,%%rcx\n"
    /* c = a4 * a4 */
    "movq %%r14,%%rax\n"
    "mulq %%r14\n"
    "movq %%rax,%%r8\n"
    "movq %%rdx,%%r9\n"
    /* d += (c & M) * R */
    "andq %%r15,%%rax\n"
    "movq $0x1000003d10,%%rdx\n"
    "mulq %%rdx\n"
    "addq %%rax,%%rbx\n"
    "adcq %%rdx,%%rcx\n"
    /* c >>= 52 (%%r8 only) */
    "shrdq $52,%%r9,%%r8\n"
    /* t3 (memory) = d & M */
    "movq %%rbx,%%rsi\n"
    "andq %%r15,%%rsi\n"
    "movq %%rsi,%[t3]\n"
    /* d >>= 52 */
    "shrdq $52,%%rcx,%%rbx\n"
    "xorq %%rcx,%%rcx\n"
    /* a4 *= 2 (%%r14 holds 2*a4 from here on) */
    "addq %%r14,%%r14\n"
    /* d += a0 * a4 */
    "movq %%r10,%%rax\n"
    "mulq %%r14\n"
    "addq %%rax,%%rbx\n"
    "adcq %%rdx,%%rcx\n"
    /* d += (a1*2) * a3 */
    "leaq (%%r11,%%r11,1),%%rax\n"
    "mulq %%r13\n"
    "addq %%rax,%%rbx\n"
    "adcq %%rdx,%%rcx\n"
    /* d += a2 * a2 */
    "movq %%r12,%%rax\n"
    "mulq %%r12\n"
    "addq %%rax,%%rbx\n"
    "adcq %%rdx,%%rcx\n"
    /* d += c * R */
    "movq %%r8,%%rax\n"
    "movq $0x1000003d10,%%rdx\n"
    "mulq %%rdx\n"
    "addq %%rax,%%rbx\n"
    "adcq %%rdx,%%rcx\n"
    /* t4 = d & M (%%rsi) */
    "movq %%rbx,%%rsi\n"
    "andq %%r15,%%rsi\n"
    /* d >>= 52 */
    "shrdq $52,%%rcx,%%rbx\n"
    "xorq %%rcx,%%rcx\n"
    /* tx (memory) = t4 >> 48 */
    "movq %%rsi,%%rax\n"
    "shrq $48,%%rax\n"
    "movq %%rax,%[tx]\n"
    /* t4 &= (M >> 4) (memory) */
    "movq $0xffffffffffff,%%rax\n"
    "andq %%rax,%%rsi\n"
    "movq %%rsi,%[t4]\n"
    /* c = a0 * a0 */
    "movq %%r10,%%rax\n"
    "mulq %%r10\n"
    "movq %%rax,%%r8\n"
    "movq %%rdx,%%r9\n"
    /* d += a1 * a4 */
    "movq %%r11,%%rax\n"
    "mulq %%r14\n"
    "addq %%rax,%%rbx\n"
    "adcq %%rdx,%%rcx\n"
    /* d += (a2*2) * a3 */
    "leaq (%%r12,%%r12,1),%%rax\n"
    "mulq %%r13\n"
    "addq %%rax,%%rbx\n"
    "adcq %%rdx,%%rcx\n"
    /* u0 = d & M (%%rsi) */
    "movq %%rbx,%%rsi\n"
    "andq %%r15,%%rsi\n"
    /* d >>= 52 */
    "shrdq $52,%%rcx,%%rbx\n"
    "xorq %%rcx,%%rcx\n"
    /* u0 = (u0 << 4) | tx (%%rsi) */
    "shlq $4,%%rsi\n"
    "orq %[tx],%%rsi\n"
    /* c += u0 * (R >> 4) */
    "movq $0x1000003d1,%%rax\n"
    "mulq %%rsi\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* r[0] = c & M */
    "movq %%r8,%%rax\n"
    "andq %%r15,%%rax\n"
    "movq %%rax,0(%%rdi)\n"
    /* c >>= 52 */
    "shrdq $52,%%r9,%%r8\n"
    "xorq %%r9,%%r9\n"
    /* a0 *= 2 (%%r10 holds 2*a0 from here on) */
    "addq %%r10,%%r10\n"
    /* c += a0 * a1 */
    "movq %%r10,%%rax\n"
    "mulq %%r11\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* d += a2 * a4 */
    "movq %%r12,%%rax\n"
    "mulq %%r14\n"
    "addq %%rax,%%rbx\n"
    "adcq %%rdx,%%rcx\n"
    /* d += a3 * a3 */
    "movq %%r13,%%rax\n"
    "mulq %%r13\n"
    "addq %%rax,%%rbx\n"
    "adcq %%rdx,%%rcx\n"
    /* c += (d & M) * R */
    "movq %%rbx,%%rax\n"
    "andq %%r15,%%rax\n"
    "movq $0x1000003d10,%%rdx\n"
    "mulq %%rdx\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* d >>= 52 */
    "shrdq $52,%%rcx,%%rbx\n"
    "xorq %%rcx,%%rcx\n"
    /* r[1] = c & M */
    "movq %%r8,%%rax\n"
    "andq %%r15,%%rax\n"
    "movq %%rax,8(%%rdi)\n"
    /* c >>= 52 */
    "shrdq $52,%%r9,%%r8\n"
    "xorq %%r9,%%r9\n"
    /* c += a0 * a2 (last use of %%r10) */
    "movq %%r10,%%rax\n"
    "mulq %%r12\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* fetch t3 (%%r10, overwrites a0), t4 (%%rsi) */
    "movq %[t4],%%rsi\n"
    "movq %[t3],%%r10\n"
    /* c += a1 * a1 */
    "movq %%r11,%%rax\n"
    "mulq %%r11\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* d += a3 * a4 */
    "movq %%r13,%%rax\n"
    "mulq %%r14\n"
    "addq %%rax,%%rbx\n"
    "adcq %%rdx,%%rcx\n"
    /* c += (d & M) * R */
    "movq %%rbx,%%rax\n"
    "andq %%r15,%%rax\n"
    "movq $0x1000003d10,%%rdx\n"
    "mulq %%rdx\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* d >>= 52 (%%rbx only) */
    "shrdq $52,%%rcx,%%rbx\n"
    /* r[2] = c & M */
    "movq %%r8,%%rax\n"
    "andq %%r15,%%rax\n"
    "movq %%rax,16(%%rdi)\n"
    /* c >>= 52 */
    "shrdq $52,%%r9,%%r8\n"
    "xorq %%r9,%%r9\n"
    /* c += t3 */
    "addq %%r10,%%r8\n"
    /* c += d * R */
    "movq %%rbx,%%rax\n"
    "movq $0x1000003d10,%%rdx\n"
    "mulq %%rdx\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* r[3] = c & M */
    "movq %%r8,%%rax\n"
    "andq %%r15,%%rax\n"
    "movq %%rax,24(%%rdi)\n"
    /* c >>= 52 (%%r8 only) */
    "shrdq $52,%%r9,%%r8\n"
    /* c += t4 (%%r8 only) */
    "addq %%rsi,%%r8\n"
    /* r[4] = c */
    "movq %%r8,32(%%rdi)\n"
    : "+S"(a), [t3] "=m"(t3), [t4] "=m"(t4), [tx] "=m"(tx)
    : "D"(r)
    : "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory"
    );
}
#endif