FreeRTOS-Kernel/FreeRTOS-Plus/Source/WolfSSL/wolfcrypt/src/fe_x25519_asm.S

/* fe_x25519_asm
*
* Copyright (C) 2006-2020 wolfSSL Inc.
*
* This file is part of wolfSSL.
*
* wolfSSL is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* wolfSSL is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
*/
#ifndef HAVE_INTEL_AVX1
#define HAVE_INTEL_AVX1
#endif /* HAVE_INTEL_AVX1 */
#ifndef NO_AVX2_SUPPORT
#define HAVE_INTEL_AVX2
#endif /* NO_AVX2_SUPPORT */
#ifndef __APPLE__
.text
.globl fe_init
.type fe_init,@function
.align 4
fe_init:
#else
.section __TEXT,__text
.globl _fe_init
.p2align 2
_fe_init:
#endif /* __APPLE__ */
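# Runtime dispatch: on the first call, query the CPU feature flags and,
# when both feature bits in mask 0x50 are present, retarget the fe_*_p and
# curve25519_p function pointers at the AVX2 implementations.
# Subsequent calls return immediately once cpuFlagsSet is non-zero.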
#ifdef HAVE_INTEL_AVX2
#ifndef __APPLE__
movq cpuFlagsSet@GOTPCREL(%rip), %rax
movl (%rax), %eax
#else
movl _cpuFlagsSet(%rip), %eax
#endif /* __APPLE__ */
testl %eax, %eax
je L_fe_init_get_flags
repz retq
L_fe_init_get_flags:
#ifndef __APPLE__
callq cpuid_get_flags@plt
#else
callq _cpuid_get_flags
#endif /* __APPLE__ */
#ifndef __APPLE__
movq intelFlags@GOTPCREL(%rip), %rdx
movl %eax, (%rdx)
#else
movl %eax, _intelFlags(%rip)
#endif /* __APPLE__ */
andl $0x50, %eax
cmpl $0x50, %eax
jne L_fe_init_flags_done
#ifndef __APPLE__
movq fe_mul_avx2@GOTPCREL(%rip), %rax
#else
leaq _fe_mul_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
movq fe_mul_p@GOTPCREL(%rip), %rdx
movq %rax, (%rdx)
#else
movq %rax, _fe_mul_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
movq fe_sq_avx2@GOTPCREL(%rip), %rax
#else
leaq _fe_sq_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
movq fe_sq_p@GOTPCREL(%rip), %rdx
movq %rax, (%rdx)
#else
movq %rax, _fe_sq_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
movq fe_mul121666_avx2@GOTPCREL(%rip), %rax
#else
leaq _fe_mul121666_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
movq fe_mul121666_p@GOTPCREL(%rip), %rdx
movq %rax, (%rdx)
#else
movq %rax, _fe_mul121666_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
movq fe_sq2_avx2@GOTPCREL(%rip), %rax
#else
leaq _fe_sq2_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
movq fe_sq2_p@GOTPCREL(%rip), %rdx
movq %rax, (%rdx)
#else
movq %rax, _fe_sq2_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
movq fe_invert_avx2@GOTPCREL(%rip), %rax
#else
leaq _fe_invert_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
movq fe_invert_p@GOTPCREL(%rip), %rdx
movq %rax, (%rdx)
#else
movq %rax, _fe_invert_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
movq curve25519_avx2@GOTPCREL(%rip), %rax
#else
leaq _curve25519_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
movq curve25519_p@GOTPCREL(%rip), %rdx
movq %rax, (%rdx)
#else
movq %rax, _curve25519_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
movq fe_pow22523_avx2@GOTPCREL(%rip), %rax
#else
leaq _fe_pow22523_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
movq fe_pow22523_p@GOTPCREL(%rip), %rdx
movq %rax, (%rdx)
#else
movq %rax, _fe_pow22523_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
movq fe_ge_to_p2_avx2@GOTPCREL(%rip), %rax
#else
leaq _fe_ge_to_p2_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
movq fe_ge_to_p2_p@GOTPCREL(%rip), %rdx
movq %rax, (%rdx)
#else
movq %rax, _fe_ge_to_p2_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
movq fe_ge_to_p3_avx2@GOTPCREL(%rip), %rax
#else
leaq _fe_ge_to_p3_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
movq fe_ge_to_p3_p@GOTPCREL(%rip), %rdx
movq %rax, (%rdx)
#else
movq %rax, _fe_ge_to_p3_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
movq fe_ge_dbl_avx2@GOTPCREL(%rip), %rax
#else
leaq _fe_ge_dbl_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
movq fe_ge_dbl_p@GOTPCREL(%rip), %rdx
movq %rax, (%rdx)
#else
movq %rax, _fe_ge_dbl_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
movq fe_ge_madd_avx2@GOTPCREL(%rip), %rax
#else
leaq _fe_ge_madd_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
movq fe_ge_madd_p@GOTPCREL(%rip), %rdx
movq %rax, (%rdx)
#else
movq %rax, _fe_ge_madd_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
movq fe_ge_msub_avx2@GOTPCREL(%rip), %rax
#else
leaq _fe_ge_msub_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
movq fe_ge_msub_p@GOTPCREL(%rip), %rdx
movq %rax, (%rdx)
#else
movq %rax, _fe_ge_msub_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
movq fe_ge_add_avx2@GOTPCREL(%rip), %rax
#else
leaq _fe_ge_add_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
movq fe_ge_add_p@GOTPCREL(%rip), %rdx
movq %rax, (%rdx)
#else
movq %rax, _fe_ge_add_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
movq fe_ge_sub_avx2@GOTPCREL(%rip), %rax
#else
leaq _fe_ge_sub_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
movq fe_ge_sub_p@GOTPCREL(%rip), %rdx
movq %rax, (%rdx)
#else
movq %rax, _fe_ge_sub_p(%rip)
#endif /* __APPLE__ */
L_fe_init_flags_done:
#ifndef __APPLE__
movq cpuFlagsSet@GOTPCREL(%rip), %rdx
movl $0x1, (%rdx)
#else
movl $0x1, _cpuFlagsSet(%rip)
#endif /* __APPLE__ */
#endif /* HAVE_INTEL_AVX2 */
repz retq
#ifndef __APPLE__
.size fe_init,.-fe_init
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_frombytes
.type fe_frombytes,@function
.align 4
fe_frombytes:
#else
.section __TEXT,__text
.globl _fe_frombytes
.p2align 2
_fe_frombytes:
#endif /* __APPLE__ */
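# Load a 32-byte little-endian value and clear bit 255, giving an
# unreduced field element below 2^255.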
movq $0x7fffffffffffffff, %r9
movq (%rsi), %rdx
movq 8(%rsi), %rax
movq 16(%rsi), %rcx
movq 24(%rsi), %r8
andq %r9, %r8
movq %rdx, (%rdi)
movq %rax, 8(%rdi)
movq %rcx, 16(%rdi)
movq %r8, 24(%rdi)
repz retq
#ifndef __APPLE__
.size fe_frombytes,.-fe_frombytes
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_tobytes
.type fe_tobytes,@function
.align 4
fe_tobytes:
#else
.section __TEXT,__text
.globl _fe_tobytes
.p2align 2
_fe_tobytes:
#endif /* __APPLE__ */
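# Canonical encoding: add 19 and use bit 255 of the sum to detect whether
# the value is >= 2^255-19; then add 19 times that carry and clear bit 255
# so the stored result is fully reduced.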
movq $0x7fffffffffffffff, %r10
movq (%rsi), %rdx
movq 8(%rsi), %rax
movq 16(%rsi), %rcx
movq 24(%rsi), %r8
addq $19, %rdx
adcq $0x00, %rax
adcq $0x00, %rcx
adcq $0x00, %r8
shrq $63, %r8
imulq $19, %r8, %r9
movq (%rsi), %rdx
movq 8(%rsi), %rax
movq 16(%rsi), %rcx
movq 24(%rsi), %r8
addq %r9, %rdx
adcq $0x00, %rax
adcq $0x00, %rcx
adcq $0x00, %r8
andq %r10, %r8
movq %rdx, (%rdi)
movq %rax, 8(%rdi)
movq %rcx, 16(%rdi)
movq %r8, 24(%rdi)
repz retq
#ifndef __APPLE__
.size fe_tobytes,.-fe_tobytes
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_1
.type fe_1,@function
.align 4
fe_1:
#else
.section __TEXT,__text
.globl _fe_1
.p2align 2
_fe_1:
#endif /* __APPLE__ */
# Set one
movq $0x01, (%rdi)
movq $0x00, 8(%rdi)
movq $0x00, 16(%rdi)
movq $0x00, 24(%rdi)
repz retq
#ifndef __APPLE__
.size fe_1,.-fe_1
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_0
.type fe_0,@function
.align 4
fe_0:
#else
.section __TEXT,__text
.globl _fe_0
.p2align 2
_fe_0:
#endif /* __APPLE__ */
# Set zero
movq $0x00, (%rdi)
movq $0x00, 8(%rdi)
movq $0x00, 16(%rdi)
movq $0x00, 24(%rdi)
repz retq
#ifndef __APPLE__
.size fe_0,.-fe_0
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_copy
.type fe_copy,@function
.align 4
fe_copy:
#else
.section __TEXT,__text
.globl _fe_copy
.p2align 2
_fe_copy:
#endif /* __APPLE__ */
# Copy
movq (%rsi), %rdx
movq 8(%rsi), %rax
movq 16(%rsi), %rcx
movq 24(%rsi), %r8
movq %rdx, (%rdi)
movq %rax, 8(%rdi)
movq %rcx, 16(%rdi)
movq %r8, 24(%rdi)
repz retq
#ifndef __APPLE__
.size fe_copy,.-fe_copy
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_sub
.type fe_sub,@function
.align 4
fe_sub:
#else
.section __TEXT,__text
.globl _fe_sub
.p2align 2
_fe_sub:
#endif /* __APPLE__ */
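# Compute a - b and, when the subtraction borrows, add back the modulus
# 2^255-19 through a mask so no secret-dependent branch is taken.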
pushq %r12
# Sub
movq (%rsi), %rax
movq 8(%rsi), %rcx
movq 16(%rsi), %r8
movq 24(%rsi), %r9
subq (%rdx), %rax
movq $0x00, %r10
sbbq 8(%rdx), %rcx
movq $-19, %r11
sbbq 16(%rdx), %r8
movq $0x7fffffffffffffff, %r12
sbbq 24(%rdx), %r9
sbbq $0x00, %r10
# Mask the modulus
andq %r10, %r11
andq %r10, %r12
# Add modulus (if underflow)
addq %r11, %rax
adcq %r10, %rcx
adcq %r10, %r8
adcq %r12, %r9
movq %rax, (%rdi)
movq %rcx, 8(%rdi)
movq %r8, 16(%rdi)
movq %r9, 24(%rdi)
popq %r12
repz retq
#ifndef __APPLE__
.size fe_sub,.-fe_sub
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_add
.type fe_add,@function
.align 4
fe_add:
#else
.section __TEXT,__text
.globl _fe_add
.p2align 2
_fe_add:
#endif /* __APPLE__ */
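# Compute a + b and conditionally subtract the modulus 2^255-19 using a
# mask derived from bit 255 of the sum, again without branching.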
pushq %r12
# Add
movq (%rsi), %rax
movq 8(%rsi), %rcx
addq (%rdx), %rax
movq 16(%rsi), %r8
adcq 8(%rdx), %rcx
movq 24(%rsi), %r10
adcq 16(%rdx), %r8
movq $-19, %r11
adcq 24(%rdx), %r10
movq $0x7fffffffffffffff, %r12
movq %r10, %r9
sarq $63, %r10
# Mask the modulus
andq %r10, %r11
andq %r10, %r12
# Sub modulus (if overflow)
subq %r11, %rax
sbbq %r10, %rcx
sbbq %r10, %r8
sbbq %r12, %r9
movq %rax, (%rdi)
movq %rcx, 8(%rdi)
movq %r8, 16(%rdi)
movq %r9, 24(%rdi)
popq %r12
repz retq
#ifndef __APPLE__
.size fe_add,.-fe_add
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_neg
.type fe_neg,@function
.align 4
fe_neg:
#else
.section __TEXT,__text
.globl _fe_neg
.p2align 2
_fe_neg:
#endif /* __APPLE__ */
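# Negate: compute (2^255-19) - a by subtracting from the modulus constant.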
movq $-19, %rdx
movq $-1, %rax
movq $-1, %rcx
movq $0x7fffffffffffffff, %r8
subq (%rsi), %rdx
sbbq 8(%rsi), %rax
sbbq 16(%rsi), %rcx
sbbq 24(%rsi), %r8
movq %rdx, (%rdi)
movq %rax, 8(%rdi)
movq %rcx, 16(%rdi)
movq %r8, 24(%rdi)
repz retq
#ifndef __APPLE__
.size fe_neg,.-fe_neg
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_cmov
.type fe_cmov,@function
.align 4
fe_cmov:
#else
.section __TEXT,__text
.globl _fe_cmov
.p2align 2
_fe_cmov:
#endif /* __APPLE__ */
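# Branch-free conditional move: when the flag in the third argument is 1,
# the four limbs of b replace those of a via cmove.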
cmpl $0x01, %edx
movq (%rdi), %rcx
movq 8(%rdi), %r8
movq 16(%rdi), %r9
movq 24(%rdi), %r10
cmoveq (%rsi), %rcx
cmoveq 8(%rsi), %r8
cmoveq 16(%rsi), %r9
cmoveq 24(%rsi), %r10
movq %rcx, (%rdi)
movq %r8, 8(%rdi)
movq %r9, 16(%rdi)
movq %r10, 24(%rdi)
repz retq
#ifndef __APPLE__
.size fe_cmov,.-fe_cmov
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_isnonzero
.type fe_isnonzero,@function
.align 4
fe_isnonzero:
#else
.section __TEXT,__text
.globl _fe_isnonzero
.p2align 2
_fe_isnonzero:
#endif /* __APPLE__ */
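# Reduce as in fe_tobytes and OR the limbs together, so the return value
# is zero exactly when the element is 0 mod 2^255-19.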
movq $0x7fffffffffffffff, %r10
movq (%rdi), %rax
movq 8(%rdi), %rdx
movq 16(%rdi), %rcx
movq 24(%rdi), %r8
addq $19, %rax
adcq $0x00, %rdx
adcq $0x00, %rcx
adcq $0x00, %r8
shrq $63, %r8
imulq $19, %r8, %r9
movq (%rdi), %rax
movq 8(%rdi), %rdx
movq 16(%rdi), %rcx
movq 24(%rdi), %r8
addq %r9, %rax
adcq $0x00, %rdx
adcq $0x00, %rcx
adcq $0x00, %r8
andq %r10, %r8
orq %rdx, %rax
orq %rcx, %rax
orq %r8, %rax
repz retq
#ifndef __APPLE__
.size fe_isnonzero,.-fe_isnonzero
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_isnegative
.type fe_isnegative,@function
.align 4
fe_isnegative:
#else
.section __TEXT,__text
.globl _fe_isnegative
.p2align 2
_fe_isnegative:
#endif /* __APPLE__ */
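# Return the least significant bit of the canonically reduced element,
# i.e. its parity (used as the sign bit when encoding points).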
movq $0x7fffffffffffffff, %r11
movq (%rdi), %rdx
movq 8(%rdi), %rcx
movq 16(%rdi), %r8
movq 24(%rdi), %r9
movq %rdx, %rax
addq $19, %rdx
adcq $0x00, %rcx
adcq $0x00, %r8
adcq $0x00, %r9
shrq $63, %r9
imulq $19, %r9, %r10
addq %r10, %rax
andq $0x01, %rax
repz retq
#ifndef __APPLE__
.size fe_isnegative,.-fe_isnegative
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_cmov_table
.type fe_cmov_table,@function
.align 4
fe_cmov_table:
#else
.section __TEXT,__text
.globl _fe_cmov_table
.p2align 2
_fe_cmov_table:
#endif /* __APPLE__ */
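# Branch-free table lookup: replace the signed index in dl with its
# absolute value, select the matching 96-byte table entry with cmove
# (defaulting to 1, 1, 0 when the index is 0), and, when the original
# index was negative, swap the first two 32-byte values and negate the
# third.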
pushq %r12
pushq %r13
pushq %r14
pushq %r15
movq %rdx, %rcx
movsbq %cl, %rax
cdq
xorb %dl, %al
subb %dl, %al
movb %al, %r15b
movq $0x01, %rax
xorq %rdx, %rdx
xorq %r8, %r8
xorq %r9, %r9
movq $0x01, %r10
xorq %r11, %r11
xorq %r12, %r12
xorq %r13, %r13
cmpb $0x01, %r15b
movq (%rsi), %r14
cmoveq %r14, %rax
movq 8(%rsi), %r14
cmoveq %r14, %rdx
movq 16(%rsi), %r14
cmoveq %r14, %r8
movq 24(%rsi), %r14
cmoveq %r14, %r9
movq 32(%rsi), %r14
cmoveq %r14, %r10
movq 40(%rsi), %r14
cmoveq %r14, %r11
movq 48(%rsi), %r14
cmoveq %r14, %r12
movq 56(%rsi), %r14
cmoveq %r14, %r13
cmpb $2, %r15b
movq 96(%rsi), %r14
cmoveq %r14, %rax
movq 104(%rsi), %r14
cmoveq %r14, %rdx
movq 112(%rsi), %r14
cmoveq %r14, %r8
movq 120(%rsi), %r14
cmoveq %r14, %r9
movq 128(%rsi), %r14
cmoveq %r14, %r10
movq 136(%rsi), %r14
cmoveq %r14, %r11
movq 144(%rsi), %r14
cmoveq %r14, %r12
movq 152(%rsi), %r14
cmoveq %r14, %r13
cmpb $3, %r15b
movq 192(%rsi), %r14
cmoveq %r14, %rax
movq 200(%rsi), %r14
cmoveq %r14, %rdx
movq 208(%rsi), %r14
cmoveq %r14, %r8
movq 216(%rsi), %r14
cmoveq %r14, %r9
movq 224(%rsi), %r14
cmoveq %r14, %r10
movq 232(%rsi), %r14
cmoveq %r14, %r11
movq 240(%rsi), %r14
cmoveq %r14, %r12
movq 248(%rsi), %r14
cmoveq %r14, %r13
cmpb $4, %r15b
movq 288(%rsi), %r14
cmoveq %r14, %rax
movq 296(%rsi), %r14
cmoveq %r14, %rdx
movq 304(%rsi), %r14
cmoveq %r14, %r8
movq 312(%rsi), %r14
cmoveq %r14, %r9
movq 320(%rsi), %r14
cmoveq %r14, %r10
movq 328(%rsi), %r14
cmoveq %r14, %r11
movq 336(%rsi), %r14
cmoveq %r14, %r12
movq 344(%rsi), %r14
cmoveq %r14, %r13
cmpb $5, %r15b
movq 384(%rsi), %r14
cmoveq %r14, %rax
movq 392(%rsi), %r14
cmoveq %r14, %rdx
movq 400(%rsi), %r14
cmoveq %r14, %r8
movq 408(%rsi), %r14
cmoveq %r14, %r9
movq 416(%rsi), %r14
cmoveq %r14, %r10
movq 424(%rsi), %r14
cmoveq %r14, %r11
movq 432(%rsi), %r14
cmoveq %r14, %r12
movq 440(%rsi), %r14
cmoveq %r14, %r13
cmpb $6, %r15b
movq 480(%rsi), %r14
cmoveq %r14, %rax
movq 488(%rsi), %r14
cmoveq %r14, %rdx
movq 496(%rsi), %r14
cmoveq %r14, %r8
movq 504(%rsi), %r14
cmoveq %r14, %r9
movq 512(%rsi), %r14
cmoveq %r14, %r10
movq 520(%rsi), %r14
cmoveq %r14, %r11
movq 528(%rsi), %r14
cmoveq %r14, %r12
movq 536(%rsi), %r14
cmoveq %r14, %r13
cmpb $7, %r15b
movq 576(%rsi), %r14
cmoveq %r14, %rax
movq 584(%rsi), %r14
cmoveq %r14, %rdx
movq 592(%rsi), %r14
cmoveq %r14, %r8
movq 600(%rsi), %r14
cmoveq %r14, %r9
movq 608(%rsi), %r14
cmoveq %r14, %r10
movq 616(%rsi), %r14
cmoveq %r14, %r11
movq 624(%rsi), %r14
cmoveq %r14, %r12
movq 632(%rsi), %r14
cmoveq %r14, %r13
cmpb $8, %r15b
movq 672(%rsi), %r14
cmoveq %r14, %rax
movq 680(%rsi), %r14
cmoveq %r14, %rdx
movq 688(%rsi), %r14
cmoveq %r14, %r8
movq 696(%rsi), %r14
cmoveq %r14, %r9
movq 704(%rsi), %r14
cmoveq %r14, %r10
movq 712(%rsi), %r14
cmoveq %r14, %r11
movq 720(%rsi), %r14
cmoveq %r14, %r12
movq 728(%rsi), %r14
cmoveq %r14, %r13
cmpb $0x00, %cl
movq %rax, %r14
cmovlq %r10, %rax
cmovlq %r14, %r10
movq %rdx, %r14
cmovlq %r11, %rdx
cmovlq %r14, %r11
movq %r8, %r14
cmovlq %r12, %r8
cmovlq %r14, %r12
movq %r9, %r14
cmovlq %r13, %r9
cmovlq %r14, %r13
movq %rax, (%rdi)
movq %rdx, 8(%rdi)
movq %r8, 16(%rdi)
movq %r9, 24(%rdi)
movq %r10, 32(%rdi)
movq %r11, 40(%rdi)
movq %r12, 48(%rdi)
movq %r13, 56(%rdi)
xorq %rax, %rax
xorq %rdx, %rdx
xorq %r8, %r8
xorq %r9, %r9
cmpb $0x01, %r15b
movq 64(%rsi), %r14
cmoveq %r14, %rax
movq 72(%rsi), %r14
cmoveq %r14, %rdx
movq 80(%rsi), %r14
cmoveq %r14, %r8
movq 88(%rsi), %r14
cmoveq %r14, %r9
cmpb $2, %r15b
movq 160(%rsi), %r14
cmoveq %r14, %rax
movq 168(%rsi), %r14
cmoveq %r14, %rdx
movq 176(%rsi), %r14
cmoveq %r14, %r8
movq 184(%rsi), %r14
cmoveq %r14, %r9
cmpb $3, %r15b
movq 256(%rsi), %r14
cmoveq %r14, %rax
movq 264(%rsi), %r14
cmoveq %r14, %rdx
movq 272(%rsi), %r14
cmoveq %r14, %r8
movq 280(%rsi), %r14
cmoveq %r14, %r9
cmpb $4, %r15b
movq 352(%rsi), %r14
cmoveq %r14, %rax
movq 360(%rsi), %r14
cmoveq %r14, %rdx
movq 368(%rsi), %r14
cmoveq %r14, %r8
movq 376(%rsi), %r14
cmoveq %r14, %r9
cmpb $5, %r15b
movq 448(%rsi), %r14
cmoveq %r14, %rax
movq 456(%rsi), %r14
cmoveq %r14, %rdx
movq 464(%rsi), %r14
cmoveq %r14, %r8
movq 472(%rsi), %r14
cmoveq %r14, %r9
cmpb $6, %r15b
movq 544(%rsi), %r14
cmoveq %r14, %rax
movq 552(%rsi), %r14
cmoveq %r14, %rdx
movq 560(%rsi), %r14
cmoveq %r14, %r8
movq 568(%rsi), %r14
cmoveq %r14, %r9
cmpb $7, %r15b
movq 640(%rsi), %r14
cmoveq %r14, %rax
movq 648(%rsi), %r14
cmoveq %r14, %rdx
movq 656(%rsi), %r14
cmoveq %r14, %r8
movq 664(%rsi), %r14
cmoveq %r14, %r9
cmpb $8, %r15b
movq 736(%rsi), %r14
cmoveq %r14, %rax
movq 744(%rsi), %r14
cmoveq %r14, %rdx
movq 752(%rsi), %r14
cmoveq %r14, %r8
movq 760(%rsi), %r14
cmoveq %r14, %r9
movq $-19, %r10
movq $-1, %r11
movq $-1, %r12
movq $0x7fffffffffffffff, %r13
subq %rax, %r10
sbbq %rdx, %r11
sbbq %r8, %r12
sbbq %r9, %r13
cmpb $0x00, %cl
cmovlq %r10, %rax
cmovlq %r11, %rdx
cmovlq %r12, %r8
cmovlq %r13, %r9
movq %rax, 64(%rdi)
movq %rdx, 72(%rdi)
movq %r8, 80(%rdi)
movq %r9, 88(%rdi)
popq %r15
popq %r14
popq %r13
popq %r12
repz retq
#ifndef __APPLE__
.size fe_cmov_table,.-fe_cmov_table
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_mul
.type fe_mul,@function
.align 4
fe_mul:
#else
.section __TEXT,__text
.globl _fe_mul
.p2align 2
_fe_mul:
#endif /* __APPLE__ */
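# The entry points below tail-jump through the function pointers that
# fe_init leaves at the _x64 routines or retargets to the AVX2 variants.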
#ifndef __APPLE__
jmpq *fe_mul_p(%rip)
#else
jmpq *_fe_mul_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
.size fe_mul,.-fe_mul
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_sq
.type fe_sq,@function
.align 4
fe_sq:
#else
.section __TEXT,__text
.globl _fe_sq
.p2align 2
_fe_sq:
#endif /* __APPLE__ */
#ifndef __APPLE__
jmpq *fe_sq_p(%rip)
#else
jmpq *_fe_sq_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
.size fe_sq,.-fe_sq
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_mul121666
.type fe_mul121666,@function
.align 4
fe_mul121666:
#else
.section __TEXT,__text
.globl _fe_mul121666
.p2align 2
_fe_mul121666:
#endif /* __APPLE__ */
#ifndef __APPLE__
jmpq *fe_mul121666_p(%rip)
#else
jmpq *_fe_mul121666_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
.size fe_mul121666,.-fe_mul121666
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_sq2
.type fe_sq2,@function
.align 4
fe_sq2:
#else
.section __TEXT,__text
.globl _fe_sq2
.p2align 2
_fe_sq2:
#endif /* __APPLE__ */
#ifndef __APPLE__
jmpq *fe_sq2_p(%rip)
#else
jmpq *_fe_sq2_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
.size fe_sq2,.-fe_sq2
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_invert
.type fe_invert,@function
.align 4
fe_invert:
#else
.section __TEXT,__text
.globl _fe_invert
.p2align 2
_fe_invert:
#endif /* __APPLE__ */
#ifndef __APPLE__
jmpq *fe_invert_p(%rip)
#else
jmpq *_fe_invert_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
.size fe_invert,.-fe_invert
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl curve25519
.type curve25519,@function
.align 4
curve25519:
#else
.section __TEXT,__text
.globl _curve25519
.p2align 2
_curve25519:
#endif /* __APPLE__ */
#ifndef __APPLE__
jmpq *curve25519_p(%rip)
#else
jmpq *_curve25519_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
.size curve25519,.-curve25519
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_pow22523
.type fe_pow22523,@function
.align 4
fe_pow22523:
#else
.section __TEXT,__text
.globl _fe_pow22523
.p2align 2
_fe_pow22523:
#endif /* __APPLE__ */
#ifndef __APPLE__
jmpq *fe_pow22523_p(%rip)
#else
jmpq *_fe_pow22523_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
.size fe_pow22523,.-fe_pow22523
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_ge_to_p2
.type fe_ge_to_p2,@function
.align 4
fe_ge_to_p2:
#else
.section __TEXT,__text
.globl _fe_ge_to_p2
.p2align 2
_fe_ge_to_p2:
#endif /* __APPLE__ */
#ifndef __APPLE__
jmpq *fe_ge_to_p2_p(%rip)
#else
jmpq *_fe_ge_to_p2_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
.size fe_ge_to_p2,.-fe_ge_to_p2
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_ge_to_p3
.type fe_ge_to_p3,@function
.align 4
fe_ge_to_p3:
#else
.section __TEXT,__text
.globl _fe_ge_to_p3
.p2align 2
_fe_ge_to_p3:
#endif /* __APPLE__ */
#ifndef __APPLE__
jmpq *fe_ge_to_p3_p(%rip)
#else
jmpq *_fe_ge_to_p3_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
.size fe_ge_to_p3,.-fe_ge_to_p3
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_ge_dbl
.type fe_ge_dbl,@function
.align 4
fe_ge_dbl:
#else
.section __TEXT,__text
.globl _fe_ge_dbl
.p2align 2
_fe_ge_dbl:
#endif /* __APPLE__ */
#ifndef __APPLE__
jmpq *fe_ge_dbl_p(%rip)
#else
jmpq *_fe_ge_dbl_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
.size fe_ge_dbl,.-fe_ge_dbl
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_ge_madd
.type fe_ge_madd,@function
.align 4
fe_ge_madd:
#else
.section __TEXT,__text
.globl _fe_ge_madd
.p2align 2
_fe_ge_madd:
#endif /* __APPLE__ */
#ifndef __APPLE__
jmpq *fe_ge_madd_p(%rip)
#else
jmpq *_fe_ge_madd_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
.size fe_ge_madd,.-fe_ge_madd
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_ge_msub
.type fe_ge_msub,@function
.align 4
fe_ge_msub:
#else
.section __TEXT,__text
.globl _fe_ge_msub
.p2align 2
_fe_ge_msub:
#endif /* __APPLE__ */
#ifndef __APPLE__
jmpq *fe_ge_msub_p(%rip)
#else
jmpq *_fe_ge_msub_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
.size fe_ge_msub,.-fe_ge_msub
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_ge_add
.type fe_ge_add,@function
.align 4
fe_ge_add:
#else
.section __TEXT,__text
.globl _fe_ge_add
.p2align 2
_fe_ge_add:
#endif /* __APPLE__ */
#ifndef __APPLE__
jmpq *fe_ge_add_p(%rip)
#else
jmpq *_fe_ge_add_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
.size fe_ge_add,.-fe_ge_add
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_ge_sub
.type fe_ge_sub,@function
.align 4
fe_ge_sub:
#else
.section __TEXT,__text
.globl _fe_ge_sub
.p2align 2
_fe_ge_sub:
#endif /* __APPLE__ */
#ifndef __APPLE__
jmpq *fe_ge_sub_p(%rip)
#else
jmpq *_fe_ge_sub_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
.size fe_ge_sub,.-fe_ge_sub
#endif /* __APPLE__ */
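# Function pointer storage, initialised to the plain x64 routines and
# overwritten by fe_init when the AVX2 code path is selected.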
#ifndef __APPLE__
.data
.type cpuFlagsSet, @object
.size cpuFlagsSet,4
cpuFlagsSet:
.long 0
#else
.section __DATA,__data
.p2align 2
_cpuFlagsSet:
.long 0
#endif /* __APPLE__ */
#ifndef __APPLE__
.data
.type intelFlags, @object
.size intelFlags,4
intelFlags:
.long 0
#else
.section __DATA,__data
.p2align 2
_intelFlags:
.long 0
#endif /* __APPLE__ */
#ifndef __APPLE__
.data
.type fe_mul_p, @object
.size fe_mul_p,8
fe_mul_p:
.quad fe_mul_x64
#else
.section __DATA,__data
.p2align 2
_fe_mul_p:
.quad _fe_mul_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.data
.type fe_sq_p, @object
.size fe_sq_p,8
fe_sq_p:
.quad fe_sq_x64
#else
.section __DATA,__data
.p2align 2
_fe_sq_p:
.quad _fe_sq_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.data
.type fe_mul121666_p, @object
.size fe_mul121666_p,8
fe_mul121666_p:
.quad fe_mul121666_x64
#else
.section __DATA,__data
.p2align 2
_fe_mul121666_p:
.quad _fe_mul121666_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.data
.type fe_sq2_p, @object
.size fe_sq2_p,8
fe_sq2_p:
.quad fe_sq2_x64
#else
.section __DATA,__data
.p2align 2
_fe_sq2_p:
.quad _fe_sq2_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.data
.type fe_invert_p, @object
.size fe_invert_p,8
fe_invert_p:
.quad fe_invert_x64
#else
.section __DATA,__data
.p2align 2
_fe_invert_p:
.quad _fe_invert_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.data
.type curve25519_p, @object
.size curve25519_p,8
curve25519_p:
.quad curve25519_x64
#else
.section __DATA,__data
.p2align 2
_curve25519_p:
.quad _curve25519_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.data
.type fe_pow22523_p, @object
.size fe_pow22523_p,8
fe_pow22523_p:
.quad fe_pow22523_x64
#else
.section __DATA,__data
.p2align 2
_fe_pow22523_p:
.quad _fe_pow22523_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.data
.type fe_ge_to_p2_p, @object
.size fe_ge_to_p2_p,8
fe_ge_to_p2_p:
.quad fe_ge_to_p2_x64
#else
.section __DATA,__data
.p2align 2
_fe_ge_to_p2_p:
.quad _fe_ge_to_p2_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.data
.type fe_ge_to_p3_p, @object
.size fe_ge_to_p3_p,8
fe_ge_to_p3_p:
.quad fe_ge_to_p3_x64
#else
.section __DATA,__data
.p2align 2
_fe_ge_to_p3_p:
.quad _fe_ge_to_p3_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.data
.type fe_ge_dbl_p, @object
.size fe_ge_dbl_p,8
fe_ge_dbl_p:
.quad fe_ge_dbl_x64
#else
.section __DATA,__data
.p2align 2
_fe_ge_dbl_p:
.quad _fe_ge_dbl_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.data
.type fe_ge_madd_p, @object
.size fe_ge_madd_p,8
fe_ge_madd_p:
.quad fe_ge_madd_x64
#else
.section __DATA,__data
.p2align 2
_fe_ge_madd_p:
.quad _fe_ge_madd_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.data
.type fe_ge_msub_p, @object
.size fe_ge_msub_p,8
fe_ge_msub_p:
.quad fe_ge_msub_x64
#else
.section __DATA,__data
.p2align 2
_fe_ge_msub_p:
.quad _fe_ge_msub_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.data
.type fe_ge_add_p, @object
.size fe_ge_add_p,8
fe_ge_add_p:
.quad fe_ge_add_x64
#else
.section __DATA,__data
.p2align 2
_fe_ge_add_p:
.quad _fe_ge_add_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.data
.type fe_ge_sub_p, @object
.size fe_ge_sub_p,8
fe_ge_sub_p:
.quad fe_ge_sub_x64
#else
.section __DATA,__data
.p2align 2
_fe_ge_sub_p:
.quad _fe_ge_sub_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_mul_x64
.type fe_mul_x64,@function
.align 4
fe_mul_x64:
#else
.section __TEXT,__text
.globl _fe_mul_x64
.p2align 2
_fe_mul_x64:
#endif /* __APPLE__ */
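# Multiply two field elements: a full 4x4 schoolbook product into eight
# 64-bit words, followed by reduction modulo 2^255-19.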
pushq %r12
pushq %r13
pushq %r14
pushq %r15
pushq %rbx
movq %rdx, %rcx
# Multiply
# A[0] * B[0]
movq (%rcx), %rax
mulq (%rsi)
movq %rax, %r8
movq %rdx, %r9
# A[0] * B[1]
movq 8(%rcx), %rax
mulq (%rsi)
xorq %r10, %r10
addq %rax, %r9
adcq %rdx, %r10
# A[1] * B[0]
movq (%rcx), %rax
mulq 8(%rsi)
xorq %r11, %r11
addq %rax, %r9
adcq %rdx, %r10
adcq $0x00, %r11
# A[0] * B[2]
movq 16(%rcx), %rax
mulq (%rsi)
addq %rax, %r10
adcq %rdx, %r11
# A[1] * B[1]
movq 8(%rcx), %rax
mulq 8(%rsi)
xorq %r12, %r12
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[2] * B[0]
movq (%rcx), %rax
mulq 16(%rsi)
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[0] * B[3]
movq 24(%rcx), %rax
mulq (%rsi)
xorq %r13, %r13
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[2]
movq 16(%rcx), %rax
mulq 8(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[2] * B[1]
movq 8(%rcx), %rax
mulq 16(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[3] * B[0]
movq (%rcx), %rax
mulq 24(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[3]
movq 24(%rcx), %rax
mulq 8(%rsi)
xorq %r14, %r14
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[2]
movq 16(%rcx), %rax
mulq 16(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[3] * B[1]
movq 8(%rcx), %rax
mulq 24(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[3]
movq 24(%rcx), %rax
mulq 16(%rsi)
xorq %r15, %r15
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[2]
movq 16(%rcx), %rax
mulq 24(%rsi)
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[3]
movq 24(%rcx), %rax
mulq 24(%rsi)
addq %rax, %r14
adcq %rdx, %r15
# Reduce
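# Since 2^255 = 19 (mod 2^255-19), the bits at and above position 255 are
# shifted down and folded back in multiplied by 19.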
movq $0x7fffffffffffffff, %rbx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rbx, %r11
# Multiply top half by 19
movq $19, %rax
mulq %r12
xorq %r12, %r12
addq %rax, %r8
movq $19, %rax
adcq %rdx, %r12
mulq %r13
xorq %r13, %r13
addq %rax, %r9
movq $19, %rax
adcq %rdx, %r13
mulq %r14
xorq %r14, %r14
addq %rax, %r10
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
adcq %rax, %r11
adcq $0x00, %rdx
# Overflow
shldq $0x01, %r11, %rdx
imulq $19, %rdx, %rax
andq %rbx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rbx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
popq %rbx
popq %r15
popq %r14
popq %r13
popq %r12
repz retq
#ifndef __APPLE__
.size fe_mul_x64,.-fe_mul_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_sq_x64
.type fe_sq_x64,@function
.align 4
fe_sq_x64:
#else
.section __TEXT,__text
.globl _fe_sq_x64
.p2align 2
_fe_sq_x64:
#endif /* __APPLE__ */
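# Square a field element: compute the off-diagonal products once and
# double them, add the diagonal squares, then reduce the 512-bit result
# modulo 2^255-19 as in fe_mul_x64.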
pushq %r12
pushq %r13
pushq %r14
pushq %r15
# Square
# A[0] * A[1]
movq (%rsi), %rax
mulq 8(%rsi)
movq %rax, %r8
movq %rdx, %r9
# A[0] * A[2]
movq (%rsi), %rax
mulq 16(%rsi)
xorq %r10, %r10
addq %rax, %r9
adcq %rdx, %r10
# A[0] * A[3]
movq (%rsi), %rax
mulq 24(%rsi)
xorq %r11, %r11
addq %rax, %r10
adcq %rdx, %r11
# A[1] * A[2]
movq 8(%rsi), %rax
mulq 16(%rsi)
xorq %r12, %r12
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[1] * A[3]
movq 8(%rsi), %rax
mulq 24(%rsi)
addq %rax, %r11
adcq %rdx, %r12
# A[2] * A[3]
movq 16(%rsi), %rax
mulq 24(%rsi)
xorq %r13, %r13
addq %rax, %r12
adcq %rdx, %r13
# Double
xorq %r14, %r14
addq %r8, %r8
adcq %r9, %r9
adcq %r10, %r10
adcq %r11, %r11
adcq %r12, %r12
adcq %r13, %r13
adcq $0x00, %r14
# A[0] * A[0]
movq (%rsi), %rax
mulq %rax
movq %rax, %rcx
movq %rdx, %r15
# A[1] * A[1]
movq 8(%rsi), %rax
mulq %rax
addq %r15, %r8
adcq %rax, %r9
adcq $0x00, %rdx
movq %rdx, %r15
# A[2] * A[2]
movq 16(%rsi), %rax
mulq %rax
addq %r15, %r10
adcq %rax, %r11
adcq $0x00, %rdx
movq %rdx, %r15
# A[3] * A[3]
movq 24(%rsi), %rax
mulq %rax
addq %rax, %r13
adcq %rdx, %r14
addq %r15, %r12
adcq $0x00, %r13
adcq $0x00, %r14
# Reduce
movq $0x7fffffffffffffff, %r15
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
shldq $0x01, %r10, %r11
andq %r15, %r10
# Multiply top half by 19
movq $19, %rax
mulq %r11
xorq %r11, %r11
addq %rax, %rcx
movq $19, %rax
adcq %rdx, %r11
mulq %r12
xorq %r12, %r12
addq %rax, %r8
movq $19, %rax
adcq %rdx, %r12
mulq %r13
xorq %r13, %r13
addq %rax, %r9
movq $19, %rax
adcq %rdx, %r13
mulq %r14
# Add remaining product results in
addq %r11, %r8
adcq %r12, %r9
adcq %r13, %r10
adcq %rax, %r10
adcq $0x00, %rdx
# Overflow
shldq $0x01, %r10, %rdx
imulq $19, %rdx, %rax
andq %r15, %r10
addq %rax, %rcx
adcq $0x00, %r8
adcq $0x00, %r9
adcq $0x00, %r10
# Reduce if top bit set
movq %r10, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %r15, %r10
addq %rax, %rcx
adcq $0x00, %r8
adcq $0x00, %r9
adcq $0x00, %r10
# Store
movq %rcx, (%rdi)
movq %r8, 8(%rdi)
movq %r9, 16(%rdi)
movq %r10, 24(%rdi)
popq %r15
popq %r14
popq %r13
popq %r12
repz retq
#ifndef __APPLE__
.size fe_sq_x64,.-fe_sq_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_sq_n_x64
.type fe_sq_n_x64,@function
.align 4
fe_sq_n_x64:
#else
.section __TEXT,__text
.globl _fe_sq_n_x64
.p2align 2
_fe_sq_n_x64:
#endif /* __APPLE__ */
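# Square the input repeatedly: the iteration count arrives in the third
# argument and the square-and-reduce body below loops until it hits zero.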
pushq %r12
pushq %r13
pushq %r14
pushq %r15
pushq %rbx
movq %rdx, %rcx
L_fe_sq_n_x64:
# Square
# A[0] * A[1]
movq (%rsi), %rax
mulq 8(%rsi)
movq %rax, %r9
movq %rdx, %r10
# A[0] * A[2]
movq (%rsi), %rax
mulq 16(%rsi)
xorq %r11, %r11
addq %rax, %r10
adcq %rdx, %r11
# A[0] * A[3]
movq (%rsi), %rax
mulq 24(%rsi)
xorq %r12, %r12
addq %rax, %r11
adcq %rdx, %r12
# A[1] * A[2]
movq 8(%rsi), %rax
mulq 16(%rsi)
xorq %r13, %r13
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * A[3]
movq 8(%rsi), %rax
mulq 24(%rsi)
addq %rax, %r12
adcq %rdx, %r13
# A[2] * A[3]
movq 16(%rsi), %rax
mulq 24(%rsi)
xorq %r14, %r14
addq %rax, %r13
adcq %rdx, %r14
# Double
xorq %r15, %r15
addq %r9, %r9
adcq %r10, %r10
adcq %r11, %r11
adcq %r12, %r12
adcq %r13, %r13
adcq %r14, %r14
adcq $0x00, %r15
# A[0] * A[0]
movq (%rsi), %rax
mulq %rax
movq %rax, %r8
movq %rdx, %rbx
# A[1] * A[1]
movq 8(%rsi), %rax
mulq %rax
addq %rbx, %r9
adcq %rax, %r10
adcq $0x00, %rdx
movq %rdx, %rbx
# A[2] * A[2]
movq 16(%rsi), %rax
mulq %rax
addq %rbx, %r11
adcq %rax, %r12
adcq $0x00, %rdx
movq %rdx, %rbx
# A[3] * A[3]
movq 24(%rsi), %rax
mulq %rax
addq %rax, %r14
adcq %rdx, %r15
addq %rbx, %r13
adcq $0x00, %r14
adcq $0x00, %r15
# Reduce
movq $0x7fffffffffffffff, %rbx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rbx, %r11
# Multiply top half by 19
movq $19, %rax
mulq %r12
xorq %r12, %r12
addq %rax, %r8
movq $19, %rax
adcq %rdx, %r12
mulq %r13
xorq %r13, %r13
addq %rax, %r9
movq $19, %rax
adcq %rdx, %r13
mulq %r14
xorq %r14, %r14
addq %rax, %r10
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
adcq %rax, %r11
adcq $0x00, %rdx
# Overflow
shldq $0x01, %r11, %rdx
imulq $19, %rdx, %rax
andq %rbx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rbx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
decb %cl
jnz L_fe_sq_n_x64
popq %rbx
popq %r15
popq %r14
popq %r13
popq %r12
repz retq
#ifndef __APPLE__
.size fe_sq_n_x64,.-fe_sq_n_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_mul121666_x64
.type fe_mul121666_x64,@function
.align 4
fe_mul121666_x64:
#else
.section __TEXT,__text
.globl _fe_mul121666_x64
.p2align 2
_fe_mul121666_x64:
#endif /* __APPLE__ */
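# Multiply by the curve constant 121666 = 0x1db42 = (486662 + 2) / 4 used
# in the Montgomery ladder step, then reduce modulo 2^255-19.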
pushq %r12
# Multiply by 121666
movq $0x1db42, %rax
mulq (%rsi)
xorq %r10, %r10
movq %rax, %r8
movq %rdx, %r9
movq $0x1db42, %rax
mulq 8(%rsi)
xorq %r11, %r11
addq %rax, %r9
adcq %rdx, %r10
movq $0x1db42, %rax
mulq 16(%rsi)
xorq %r12, %r12
addq %rax, %r10
adcq %rdx, %r11
movq $0x1db42, %rax
mulq 24(%rsi)
movq $0x7fffffffffffffff, %rcx
addq %rax, %r11
adcq %rdx, %r12
shldq $0x01, %r11, %r12
andq %rcx, %r11
movq $19, %rax
mulq %r12
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
popq %r12
repz retq
#ifndef __APPLE__
.size fe_mul121666_x64,.-fe_mul121666_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_sq2_x64
.type fe_sq2_x64,@function
.align 4
fe_sq2_x64:
#else
.section __TEXT,__text
.globl _fe_sq2_x64
.p2align 2
_fe_sq2_x64:
#endif /* __APPLE__ */
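# Square and double: compute 2*a^2, folding the extra doubling into the
# shifts of the reduction step.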
pushq %r12
pushq %r13
pushq %r14
pushq %r15
pushq %rbx
# Square * 2
# A[0] * A[1]
movq (%rsi), %rax
mulq 8(%rsi)
movq %rax, %r8
movq %rdx, %r9
# A[0] * A[2]
movq (%rsi), %rax
mulq 16(%rsi)
xorq %r10, %r10
addq %rax, %r9
adcq %rdx, %r10
# A[0] * A[3]
movq (%rsi), %rax
mulq 24(%rsi)
xorq %r11, %r11
addq %rax, %r10
adcq %rdx, %r11
# A[1] * A[2]
movq 8(%rsi), %rax
mulq 16(%rsi)
xorq %r12, %r12
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[1] * A[3]
movq 8(%rsi), %rax
mulq 24(%rsi)
addq %rax, %r11
adcq %rdx, %r12
# A[2] * A[3]
movq 16(%rsi), %rax
mulq 24(%rsi)
xorq %r13, %r13
addq %rax, %r12
adcq %rdx, %r13
# Double
xorq %r14, %r14
addq %r8, %r8
adcq %r9, %r9
adcq %r10, %r10
adcq %r11, %r11
adcq %r12, %r12
adcq %r13, %r13
adcq $0x00, %r14
# A[0] * A[0]
movq (%rsi), %rax
mulq %rax
movq %rax, %rcx
movq %rdx, %r15
# A[1] * A[1]
movq 8(%rsi), %rax
mulq %rax
addq %r15, %r8
adcq %rax, %r9
adcq $0x00, %rdx
movq %rdx, %r15
# A[2] * A[2]
movq 16(%rsi), %rax
mulq %rax
addq %r15, %r10
adcq %rax, %r11
adcq $0x00, %rdx
movq %rdx, %r15
# A[3] * A[3]
movq 24(%rsi), %rax
mulq %rax
addq %rax, %r13
adcq %rdx, %r14
addq %r15, %r12
adcq $0x00, %r13
adcq $0x00, %r14
# Reduce
movq $0x7fffffffffffffff, %rbx
xorq %rax, %rax
# Move top half into t4-t7 and remove top bit from t3
shldq $3, %r14, %rax
shldq $2, %r13, %r14
shldq $2, %r12, %r13
shldq $2, %r11, %r12
shldq $2, %r10, %r11
shldq $0x01, %r9, %r10
shldq $0x01, %r8, %r9
shldq $0x01, %rcx, %r8
shlq $0x01, %rcx
andq %rbx, %r10
# Two out left, one in right
andq %rbx, %r14
# Multiply top bits by 19*19
imulq $0x169, %rax, %r15
# Multiply top half by 19
movq $19, %rax
mulq %r11
xorq %r11, %r11
addq %rax, %rcx
movq $19, %rax
adcq %rdx, %r11
mulq %r12
xorq %r12, %r12
addq %rax, %r8
movq $19, %rax
adcq %rdx, %r12
mulq %r13
xorq %r13, %r13
addq %rax, %r9
movq $19, %rax
adcq %rdx, %r13
mulq %r14
# Add remaining product results in
addq %r15, %rcx
adcq %r11, %r8
adcq %r12, %r9
adcq %r13, %r10
adcq %rax, %r10
adcq $0x00, %rdx
# Overflow
shldq $0x01, %r10, %rdx
imulq $19, %rdx, %rax
andq %rbx, %r10
addq %rax, %rcx
adcq $0x00, %r8
adcq $0x00, %r9
adcq $0x00, %r10
# Reduce if top bit set
movq %r10, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rbx, %r10
addq %rax, %rcx
adcq $0x00, %r8
adcq $0x00, %r9
adcq $0x00, %r10
# Store
movq %rcx, (%rdi)
movq %r8, 8(%rdi)
movq %r9, 16(%rdi)
movq %r10, 24(%rdi)
popq %rbx
popq %r15
popq %r14
popq %r13
popq %r12
repz retq
#ifndef __APPLE__
.size fe_sq2_x64,.-fe_sq2_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_invert_x64
.type fe_invert_x64,@function
.align 4
fe_invert_x64:
#else
.section __TEXT,__text
.globl _fe_invert_x64
.p2align 2
_fe_invert_x64:
#endif /* __APPLE__ */
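# Invert via Fermat's little theorem: raise the input to p - 2 for
# p = 2^255 - 19 using a fixed chain of squarings (fe_sq_x64 and
# fe_sq_n_x64) and multiplications, with temporaries held on the stack.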
subq $0x90, %rsp
# Invert
movq %rdi, 128(%rsp)
movq %rsi, 136(%rsp)
movq %rsp, %rdi
movq 136(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_x64@plt
#else
callq _fe_sq_x64
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
movq %rsp, %rsi
#ifndef __APPLE__
callq fe_sq_x64@plt
#else
callq _fe_sq_x64
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 32(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_x64@plt
#else
callq _fe_sq_x64
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
movq 136(%rsp), %rsi
leaq 32(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_x64@plt
#else
callq _fe_mul_x64
#endif /* __APPLE__ */
movq %rsp, %rdi
movq %rsp, %rsi
leaq 32(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_x64@plt
#else
callq _fe_mul_x64
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
movq %rsp, %rsi
#ifndef __APPLE__
callq fe_sq_x64@plt
#else
callq _fe_sq_x64
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 32(%rsp), %rsi
leaq 64(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_x64@plt
#else
callq _fe_mul_x64
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 32(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_x64@plt
#else
callq _fe_sq_x64
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 64(%rsp), %rsi
movq $4, %rdx
#ifndef __APPLE__
callq fe_sq_n_x64@plt
#else
callq _fe_sq_n_x64
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 64(%rsp), %rsi
leaq 32(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_x64@plt
#else
callq _fe_mul_x64
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 32(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_x64@plt
#else
callq _fe_sq_x64
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 64(%rsp), %rsi
movq $9, %rdx
#ifndef __APPLE__
callq fe_sq_n_x64@plt
#else
callq _fe_sq_n_x64
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 64(%rsp), %rsi
leaq 32(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_x64@plt
#else
callq _fe_mul_x64
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 64(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_x64@plt
#else
callq _fe_sq_x64
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 96(%rsp), %rsi
movq $19, %rdx
#ifndef __APPLE__
callq fe_sq_n_x64@plt
#else
callq _fe_sq_n_x64
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 96(%rsp), %rsi
leaq 64(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_x64@plt
#else
callq _fe_mul_x64
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 64(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_x64@plt
#else
callq _fe_sq_x64
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 64(%rsp), %rsi
movq $9, %rdx
#ifndef __APPLE__
callq fe_sq_n_x64@plt
#else
callq _fe_sq_n_x64
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 64(%rsp), %rsi
leaq 32(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_x64@plt
#else
callq _fe_mul_x64
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 32(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_x64@plt
#else
callq _fe_sq_x64
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 64(%rsp), %rsi
movq $49, %rdx
#ifndef __APPLE__
callq fe_sq_n_x64@plt
#else
callq _fe_sq_n_x64
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 64(%rsp), %rsi
leaq 32(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_x64@plt
#else
callq _fe_mul_x64
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 64(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_x64@plt
#else
callq _fe_sq_x64
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 96(%rsp), %rsi
movq $0x63, %rdx
#ifndef __APPLE__
callq fe_sq_n_x64@plt
#else
callq _fe_sq_n_x64
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 96(%rsp), %rsi
leaq 64(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_x64@plt
#else
callq _fe_mul_x64
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 64(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_x64@plt
#else
callq _fe_sq_x64
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 64(%rsp), %rsi
movq $49, %rdx
#ifndef __APPLE__
callq fe_sq_n_x64@plt
#else
callq _fe_sq_n_x64
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 64(%rsp), %rsi
leaq 32(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_x64@plt
#else
callq _fe_mul_x64
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 32(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_x64@plt
#else
callq _fe_sq_x64
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 32(%rsp), %rsi
movq $4, %rdx
#ifndef __APPLE__
callq fe_sq_n_x64@plt
#else
callq _fe_sq_n_x64
#endif /* __APPLE__ */
movq 128(%rsp), %rdi
leaq 32(%rsp), %rsi
movq %rsp, %rdx
#ifndef __APPLE__
callq fe_mul_x64@plt
#else
callq _fe_mul_x64
#endif /* __APPLE__ */
movq 136(%rsp), %rsi
movq 128(%rsp), %rdi
addq $0x90, %rsp
repz retq
#ifndef __APPLE__
.size fe_invert_x64,.-fe_invert_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl curve25519_x64
.type curve25519_x64,@function
.align 4
curve25519_x64:
#else
.section __TEXT,__text
.globl _curve25519_x64
.p2align 2
_curve25519_x64:
#endif /* __APPLE__ */
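# X25519 scalar multiplication with the Montgomery ladder: starting at
# scalar bit 254 and working down, each iteration performs a branch-free
# conditional swap followed by one ladder step of field additions,
# subtractions, multiplications, squarings and a multiply by 121666.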
pushq %r12
pushq %r13
pushq %r14
pushq %r15
pushq %rbx
pushq %rbp
movq %rdx, %r8
subq $0xb8, %rsp
xorq %rbx, %rbx
movq %rdi, 176(%rsp)
# Set one
movq $0x01, (%rdi)
movq $0x00, 8(%rdi)
movq $0x00, 16(%rdi)
movq $0x00, 24(%rdi)
# Set zero
movq $0x00, (%rsp)
movq $0x00, 8(%rsp)
movq $0x00, 16(%rsp)
movq $0x00, 24(%rsp)
# Set one
movq $0x01, 32(%rsp)
movq $0x00, 40(%rsp)
movq $0x00, 48(%rsp)
movq $0x00, 56(%rsp)
# Copy
movq (%r8), %rcx
movq 8(%r8), %r9
movq 16(%r8), %r10
movq 24(%r8), %r11
movq %rcx, 64(%rsp)
movq %r9, 72(%rsp)
movq %r10, 80(%rsp)
movq %r11, 88(%rsp)
movb $62, 168(%rsp)
movq $3, 160(%rsp)
L_curve25519_x64_words:
L_curve25519_x64_bits:
movq 160(%rsp), %r9
movb 168(%rsp), %cl
movq (%rsi,%r9,8), %rbp
shrq %cl, %rbp
andq $0x01, %rbp
xorq %rbp, %rbx
negq %rbx
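# %rbx is now 0 or all ones: the swaps below happen only when the current
# scalar bit differs from the bit processed in the previous iteration.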
# Conditional Swap
movq (%rdi), %rcx
movq 8(%rdi), %r9
movq 16(%rdi), %r10
movq 24(%rdi), %r11
xorq 64(%rsp), %rcx
xorq 72(%rsp), %r9
xorq 80(%rsp), %r10
xorq 88(%rsp), %r11
andq %rbx, %rcx
andq %rbx, %r9
andq %rbx, %r10
andq %rbx, %r11
xorq %rcx, (%rdi)
xorq %r9, 8(%rdi)
xorq %r10, 16(%rdi)
xorq %r11, 24(%rdi)
xorq %rcx, 64(%rsp)
xorq %r9, 72(%rsp)
xorq %r10, 80(%rsp)
xorq %r11, 88(%rsp)
# Conditional Swap
movq (%rsp), %rcx
movq 8(%rsp), %r9
movq 16(%rsp), %r10
movq 24(%rsp), %r11
xorq 32(%rsp), %rcx
xorq 40(%rsp), %r9
xorq 48(%rsp), %r10
xorq 56(%rsp), %r11
andq %rbx, %rcx
andq %rbx, %r9
andq %rbx, %r10
andq %rbx, %r11
xorq %rcx, (%rsp)
xorq %r9, 8(%rsp)
xorq %r10, 16(%rsp)
xorq %r11, 24(%rsp)
xorq %rcx, 32(%rsp)
xorq %r9, 40(%rsp)
xorq %r10, 48(%rsp)
xorq %r11, 56(%rsp)
movq %rbp, %rbx
# Add
movq (%rdi), %rcx
movq 8(%rdi), %r9
movq 16(%rdi), %r10
movq 24(%rdi), %rbp
movq %rcx, %r12
addq (%rsp), %rcx
movq %r9, %r13
adcq 8(%rsp), %r9
movq %r10, %r14
adcq 16(%rsp), %r10
movq %rbp, %r15
adcq 24(%rsp), %rbp
movq $-19, %rax
movq %rbp, %r11
movq $0x7fffffffffffffff, %rdx
sarq $63, %rbp
# Mask the modulus
andq %rbp, %rax
andq %rbp, %rdx
# Sub modulus (if overflow)
subq %rax, %rcx
sbbq %rbp, %r9
sbbq %rbp, %r10
sbbq %rdx, %r11
# Sub
subq (%rsp), %r12
movq $0x00, %rbp
sbbq 8(%rsp), %r13
movq $-19, %rax
sbbq 16(%rsp), %r14
movq $0x7fffffffffffffff, %rdx
sbbq 24(%rsp), %r15
sbbq $0x00, %rbp
# Mask the modulus
andq %rbp, %rax
andq %rbp, %rdx
# Add modulus (if underflow)
addq %rax, %r12
adcq %rbp, %r13
adcq %rbp, %r14
adcq %rdx, %r15
movq %rcx, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq %r12, 128(%rsp)
movq %r13, 136(%rsp)
movq %r14, 144(%rsp)
movq %r15, 152(%rsp)
# Add
movq 64(%rsp), %rcx
movq 72(%rsp), %r9
movq 80(%rsp), %r10
movq 88(%rsp), %rbp
movq %rcx, %r12
addq 32(%rsp), %rcx
movq %r9, %r13
adcq 40(%rsp), %r9
movq %r10, %r14
adcq 48(%rsp), %r10
movq %rbp, %r15
adcq 56(%rsp), %rbp
movq $-19, %rax
movq %rbp, %r11
movq $0x7fffffffffffffff, %rdx
sarq $63, %rbp
# Mask the modulus
andq %rbp, %rax
andq %rbp, %rdx
# Sub modulus (if overflow)
subq %rax, %rcx
sbbq %rbp, %r9
sbbq %rbp, %r10
sbbq %rdx, %r11
# Sub
subq 32(%rsp), %r12
movq $0x00, %rbp
sbbq 40(%rsp), %r13
movq $-19, %rax
sbbq 48(%rsp), %r14
movq $0x7fffffffffffffff, %rdx
sbbq 56(%rsp), %r15
sbbq $0x00, %rbp
# Mask the modulus
andq %rbp, %rax
andq %rbp, %rdx
# Add modulus (if underflow)
addq %rax, %r12
adcq %rbp, %r13
adcq %rbp, %r14
adcq %rdx, %r15
movq %rcx, (%rsp)
movq %r9, 8(%rsp)
movq %r10, 16(%rsp)
movq %r11, 24(%rsp)
movq %r12, 96(%rsp)
movq %r13, 104(%rsp)
movq %r14, 112(%rsp)
movq %r15, 120(%rsp)
# Multiply
# A[0] * B[0]
movq (%rdi), %rax
mulq 96(%rsp)
movq %rax, %rcx
movq %rdx, %r9
# A[0] * B[1]
movq 8(%rdi), %rax
mulq 96(%rsp)
xorq %r10, %r10
addq %rax, %r9
adcq %rdx, %r10
# A[1] * B[0]
movq (%rdi), %rax
mulq 104(%rsp)
xorq %r11, %r11
addq %rax, %r9
adcq %rdx, %r10
adcq $0x00, %r11
# A[0] * B[2]
movq 16(%rdi), %rax
mulq 96(%rsp)
addq %rax, %r10
adcq %rdx, %r11
# A[1] * B[1]
movq 8(%rdi), %rax
mulq 104(%rsp)
xorq %r12, %r12
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[2] * B[0]
movq (%rdi), %rax
mulq 112(%rsp)
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[0] * B[3]
movq 24(%rdi), %rax
mulq 96(%rsp)
xorq %r13, %r13
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[2]
movq 16(%rdi), %rax
mulq 104(%rsp)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[2] * B[1]
movq 8(%rdi), %rax
mulq 112(%rsp)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[3] * B[0]
movq (%rdi), %rax
mulq 120(%rsp)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[3]
movq 24(%rdi), %rax
mulq 104(%rsp)
xorq %r14, %r14
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[2]
movq 16(%rdi), %rax
mulq 112(%rsp)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[3] * B[1]
movq 8(%rdi), %rax
mulq 120(%rsp)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[3]
movq 24(%rdi), %rax
mulq 112(%rsp)
xorq %r15, %r15
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[2]
movq 16(%rdi), %rax
mulq 120(%rsp)
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[3]
movq 24(%rdi), %rax
mulq 120(%rsp)
addq %rax, %r14
adcq %rdx, %r15
# Reduce
movq $0x7fffffffffffffff, %rbp
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rbp, %r11
# Multiply top half by 19
movq $19, %rax
mulq %r12
xorq %r12, %r12
addq %rax, %rcx
movq $19, %rax
adcq %rdx, %r12
mulq %r13
xorq %r13, %r13
addq %rax, %r9
movq $19, %rax
adcq %rdx, %r13
mulq %r14
xorq %r14, %r14
addq %rax, %r10
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
adcq %rax, %r11
adcq $0x00, %rdx
# Overflow
shldq $0x01, %r11, %rdx
imulq $19, %rdx, %rax
andq %rbp, %r11
addq %rax, %rcx
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rbp, %r11
addq %rax, %rcx
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %rcx, 32(%rsp)
movq %r9, 40(%rsp)
movq %r10, 48(%rsp)
movq %r11, 56(%rsp)
# Multiply
# A[0] * B[0]
movq 128(%rsp), %rax
mulq (%rsp)
movq %rax, %rcx
movq %rdx, %r9
# A[0] * B[1]
movq 136(%rsp), %rax
mulq (%rsp)
xorq %r10, %r10
addq %rax, %r9
adcq %rdx, %r10
# A[1] * B[0]
movq 128(%rsp), %rax
mulq 8(%rsp)
xorq %r11, %r11
addq %rax, %r9
adcq %rdx, %r10
adcq $0x00, %r11
# A[0] * B[2]
movq 144(%rsp), %rax
mulq (%rsp)
addq %rax, %r10
adcq %rdx, %r11
# A[1] * B[1]
movq 136(%rsp), %rax
mulq 8(%rsp)
xorq %r12, %r12
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[2] * B[0]
movq 128(%rsp), %rax
mulq 16(%rsp)
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[0] * B[3]
movq 152(%rsp), %rax
mulq (%rsp)
xorq %r13, %r13
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[2]
movq 144(%rsp), %rax
mulq 8(%rsp)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[2] * B[1]
movq 136(%rsp), %rax
mulq 16(%rsp)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[3] * B[0]
movq 128(%rsp), %rax
mulq 24(%rsp)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[3]
movq 152(%rsp), %rax
mulq 8(%rsp)
xorq %r14, %r14
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[2]
movq 144(%rsp), %rax
mulq 16(%rsp)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[3] * B[1]
movq 136(%rsp), %rax
mulq 24(%rsp)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[3]
movq 152(%rsp), %rax
mulq 16(%rsp)
xorq %r15, %r15
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[2]
movq 144(%rsp), %rax
mulq 24(%rsp)
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[3]
movq 152(%rsp), %rax
mulq 24(%rsp)
addq %rax, %r14
adcq %rdx, %r15
# Reduce
movq $0x7fffffffffffffff, %rbp
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rbp, %r11
# Multiply top half by 19
movq $19, %rax
mulq %r12
xorq %r12, %r12
addq %rax, %rcx
movq $19, %rax
adcq %rdx, %r12
mulq %r13
xorq %r13, %r13
addq %rax, %r9
movq $19, %rax
adcq %rdx, %r13
mulq %r14
xorq %r14, %r14
addq %rax, %r10
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
adcq %rax, %r11
adcq $0x00, %rdx
# Overflow
shldq $0x01, %r11, %rdx
imulq $19, %rdx, %rax
andq %rbp, %r11
addq %rax, %rcx
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rbp, %r11
addq %rax, %rcx
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %rcx, (%rsp)
movq %r9, 8(%rsp)
movq %r10, 16(%rsp)
movq %r11, 24(%rsp)
# Square
# A[0] * A[1]
movq 128(%rsp), %rax
mulq 136(%rsp)
movq %rax, %r9
movq %rdx, %r10
# A[0] * A[2]
movq 128(%rsp), %rax
mulq 144(%rsp)
xorq %r11, %r11
addq %rax, %r10
adcq %rdx, %r11
# A[0] * A[3]
movq 128(%rsp), %rax
mulq 152(%rsp)
xorq %r12, %r12
addq %rax, %r11
adcq %rdx, %r12
# A[1] * A[2]
movq 136(%rsp), %rax
mulq 144(%rsp)
xorq %r13, %r13
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * A[3]
movq 136(%rsp), %rax
mulq 152(%rsp)
addq %rax, %r12
adcq %rdx, %r13
# A[2] * A[3]
movq 144(%rsp), %rax
mulq 152(%rsp)
xorq %r14, %r14
addq %rax, %r13
adcq %rdx, %r14
# Double
xorq %r15, %r15
addq %r9, %r9
adcq %r10, %r10
adcq %r11, %r11
adcq %r12, %r12
adcq %r13, %r13
adcq %r14, %r14
adcq $0x00, %r15
# A[0] * A[0]
movq 128(%rsp), %rax
mulq %rax
movq %rax, %rcx
movq %rdx, %rbp
# A[1] * A[1]
movq 136(%rsp), %rax
mulq %rax
addq %rbp, %r9
adcq %rax, %r10
adcq $0x00, %rdx
movq %rdx, %rbp
# A[2] * A[2]
movq 144(%rsp), %rax
mulq %rax
addq %rbp, %r11
adcq %rax, %r12
adcq $0x00, %rdx
movq %rdx, %rbp
# A[3] * A[3]
movq 152(%rsp), %rax
mulq %rax
addq %rax, %r14
adcq %rdx, %r15
addq %rbp, %r13
adcq $0x00, %r14
adcq $0x00, %r15
# Reduce
movq $0x7fffffffffffffff, %rbp
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rbp, %r11
# Multiply top half by 19
movq $19, %rax
mulq %r12
xorq %r12, %r12
addq %rax, %rcx
movq $19, %rax
adcq %rdx, %r12
mulq %r13
xorq %r13, %r13
addq %rax, %r9
movq $19, %rax
adcq %rdx, %r13
mulq %r14
xorq %r14, %r14
addq %rax, %r10
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
adcq %rax, %r11
adcq $0x00, %rdx
# Overflow
shldq $0x01, %r11, %rdx
imulq $19, %rdx, %rax
andq %rbp, %r11
addq %rax, %rcx
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rbp, %r11
addq %rax, %rcx
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %rcx, 96(%rsp)
movq %r9, 104(%rsp)
movq %r10, 112(%rsp)
movq %r11, 120(%rsp)
# Square
# A[0] * A[1]
movq (%rdi), %rax
mulq 8(%rdi)
movq %rax, %r9
movq %rdx, %r10
# A[0] * A[2]
movq (%rdi), %rax
mulq 16(%rdi)
xorq %r11, %r11
addq %rax, %r10
adcq %rdx, %r11
# A[0] * A[3]
movq (%rdi), %rax
mulq 24(%rdi)
xorq %r12, %r12
addq %rax, %r11
adcq %rdx, %r12
# A[1] * A[2]
movq 8(%rdi), %rax
mulq 16(%rdi)
xorq %r13, %r13
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * A[3]
movq 8(%rdi), %rax
mulq 24(%rdi)
addq %rax, %r12
adcq %rdx, %r13
# A[2] * A[3]
movq 16(%rdi), %rax
mulq 24(%rdi)
xorq %r14, %r14
addq %rax, %r13
adcq %rdx, %r14
# Double
xorq %r15, %r15
addq %r9, %r9
adcq %r10, %r10
adcq %r11, %r11
adcq %r12, %r12
adcq %r13, %r13
adcq %r14, %r14
adcq $0x00, %r15
# A[0] * A[0]
movq (%rdi), %rax
mulq %rax
movq %rax, %rcx
movq %rdx, %rbp
# A[1] * A[1]
movq 8(%rdi), %rax
mulq %rax
addq %rbp, %r9
adcq %rax, %r10
adcq $0x00, %rdx
movq %rdx, %rbp
# A[2] * A[2]
movq 16(%rdi), %rax
mulq %rax
addq %rbp, %r11
adcq %rax, %r12
adcq $0x00, %rdx
movq %rdx, %rbp
# A[3] * A[3]
movq 24(%rdi), %rax
mulq %rax
addq %rax, %r14
adcq %rdx, %r15
addq %rbp, %r13
adcq $0x00, %r14
adcq $0x00, %r15
# Reduce
movq $0x7fffffffffffffff, %rbp
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rbp, %r11
# Multiply top half by 19
movq $19, %rax
mulq %r12
xorq %r12, %r12
addq %rax, %rcx
movq $19, %rax
adcq %rdx, %r12
mulq %r13
xorq %r13, %r13
addq %rax, %r9
movq $19, %rax
adcq %rdx, %r13
mulq %r14
xorq %r14, %r14
addq %rax, %r10
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
adcq %rax, %r11
adcq $0x00, %rdx
# Overflow
shldq $0x01, %r11, %rdx
imulq $19, %rdx, %rax
andq %rbp, %r11
addq %rax, %rcx
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rbp, %r11
addq %rax, %rcx
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %rcx, 128(%rsp)
movq %r9, 136(%rsp)
movq %r10, 144(%rsp)
movq %r11, 152(%rsp)
# Add
movq 32(%rsp), %rcx
movq 40(%rsp), %r9
movq 48(%rsp), %r10
movq 56(%rsp), %rbp
movq %rcx, %r12
addq (%rsp), %rcx
movq %r9, %r13
adcq 8(%rsp), %r9
movq %r10, %r14
adcq 16(%rsp), %r10
movq %rbp, %r15
adcq 24(%rsp), %rbp
movq $-19, %rax
movq %rbp, %r11
movq $0x7fffffffffffffff, %rdx
sarq $63, %rbp
# Mask the modulus
andq %rbp, %rax
andq %rbp, %rdx
# Sub modulus (if overflow)
subq %rax, %rcx
sbbq %rbp, %r9
sbbq %rbp, %r10
sbbq %rdx, %r11
# Sub
subq (%rsp), %r12
movq $0x00, %rbp
sbbq 8(%rsp), %r13
movq $-19, %rax
sbbq 16(%rsp), %r14
movq $0x7fffffffffffffff, %rdx
sbbq 24(%rsp), %r15
sbbq $0x00, %rbp
# Mask the modulus
andq %rbp, %rax
andq %rbp, %rdx
# Add modulus (if underflow)
addq %rax, %r12
adcq %rbp, %r13
adcq %rbp, %r14
adcq %rdx, %r15
movq %rcx, 64(%rsp)
movq %r9, 72(%rsp)
movq %r10, 80(%rsp)
movq %r11, 88(%rsp)
movq %r12, (%rsp)
movq %r13, 8(%rsp)
movq %r14, 16(%rsp)
movq %r15, 24(%rsp)
# Multiply
# A[0] * B[0]
movq 96(%rsp), %rax
mulq 128(%rsp)
movq %rax, %rcx
movq %rdx, %r9
# A[0] * B[1]
movq 104(%rsp), %rax
mulq 128(%rsp)
xorq %r10, %r10
addq %rax, %r9
adcq %rdx, %r10
# A[1] * B[0]
movq 96(%rsp), %rax
mulq 136(%rsp)
xorq %r11, %r11
addq %rax, %r9
adcq %rdx, %r10
adcq $0x00, %r11
# A[0] * B[2]
movq 112(%rsp), %rax
mulq 128(%rsp)
addq %rax, %r10
adcq %rdx, %r11
# A[1] * B[1]
movq 104(%rsp), %rax
mulq 136(%rsp)
xorq %r12, %r12
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[2] * B[0]
movq 96(%rsp), %rax
mulq 144(%rsp)
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[0] * B[3]
movq 120(%rsp), %rax
mulq 128(%rsp)
xorq %r13, %r13
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[2]
movq 112(%rsp), %rax
mulq 136(%rsp)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[2] * B[1]
movq 104(%rsp), %rax
mulq 144(%rsp)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[3] * B[0]
movq 96(%rsp), %rax
mulq 152(%rsp)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[3]
movq 120(%rsp), %rax
mulq 136(%rsp)
xorq %r14, %r14
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[2]
movq 112(%rsp), %rax
mulq 144(%rsp)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[3] * B[1]
movq 104(%rsp), %rax
mulq 152(%rsp)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[3]
movq 120(%rsp), %rax
mulq 144(%rsp)
xorq %r15, %r15
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[2]
movq 112(%rsp), %rax
mulq 152(%rsp)
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[3]
movq 120(%rsp), %rax
mulq 152(%rsp)
addq %rax, %r14
adcq %rdx, %r15
# Reduce
movq $0x7fffffffffffffff, %rbp
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rbp, %r11
# Multiply top half by 19
movq $19, %rax
mulq %r12
xorq %r12, %r12
addq %rax, %rcx
movq $19, %rax
adcq %rdx, %r12
mulq %r13
xorq %r13, %r13
addq %rax, %r9
movq $19, %rax
adcq %rdx, %r13
mulq %r14
xorq %r14, %r14
addq %rax, %r10
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
adcq %rax, %r11
adcq $0x00, %rdx
# Overflow
shldq $0x01, %r11, %rdx
imulq $19, %rdx, %rax
andq %rbp, %r11
addq %rax, %rcx
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rbp, %r11
addq %rax, %rcx
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %rcx, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
# Sub
movq 128(%rsp), %rcx
movq 136(%rsp), %r9
movq 144(%rsp), %r10
movq 152(%rsp), %r11
subq 96(%rsp), %rcx
movq $0x00, %rbp
sbbq 104(%rsp), %r9
movq $-19, %rax
sbbq 112(%rsp), %r10
movq $0x7fffffffffffffff, %rdx
sbbq 120(%rsp), %r11
sbbq $0x00, %rbp
# Mask the modulus
andq %rbp, %rax
andq %rbp, %rdx
# Add modulus (if underflow)
addq %rax, %rcx
adcq %rbp, %r9
adcq %rbp, %r10
adcq %rdx, %r11
movq %rcx, 128(%rsp)
movq %r9, 136(%rsp)
movq %r10, 144(%rsp)
movq %r11, 152(%rsp)
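# Note: the squaring below computes each off-diagonal product A[i]*A[j]
# (i < j) once, doubles the accumulated sum, then adds the diagonal
# squares A[i]*A[i], so only 10 mulq instructions are needed instead of
# the 16 used by the full schoolbook multiply.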
# Square
# A[0] * A[1]
movq (%rsp), %rax
mulq 8(%rsp)
movq %rax, %r9
movq %rdx, %r10
# A[0] * A[2]
movq (%rsp), %rax
mulq 16(%rsp)
xorq %r11, %r11
addq %rax, %r10
adcq %rdx, %r11
# A[0] * A[3]
movq (%rsp), %rax
mulq 24(%rsp)
xorq %r12, %r12
addq %rax, %r11
adcq %rdx, %r12
# A[1] * A[2]
movq 8(%rsp), %rax
mulq 16(%rsp)
xorq %r13, %r13
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * A[3]
movq 8(%rsp), %rax
mulq 24(%rsp)
addq %rax, %r12
adcq %rdx, %r13
# A[2] * A[3]
movq 16(%rsp), %rax
mulq 24(%rsp)
xorq %r14, %r14
addq %rax, %r13
adcq %rdx, %r14
# Double
xorq %r15, %r15
addq %r9, %r9
adcq %r10, %r10
adcq %r11, %r11
adcq %r12, %r12
adcq %r13, %r13
adcq %r14, %r14
adcq $0x00, %r15
# A[0] * A[0]
movq (%rsp), %rax
mulq %rax
movq %rax, %rcx
movq %rdx, %rbp
# A[1] * A[1]
movq 8(%rsp), %rax
mulq %rax
addq %rbp, %r9
adcq %rax, %r10
adcq $0x00, %rdx
movq %rdx, %rbp
# A[2] * A[2]
movq 16(%rsp), %rax
mulq %rax
addq %rbp, %r11
adcq %rax, %r12
adcq $0x00, %rdx
movq %rdx, %rbp
# A[3] * A[3]
movq 24(%rsp), %rax
mulq %rax
addq %rax, %r14
adcq %rdx, %r15
addq %rbp, %r13
adcq $0x00, %r14
adcq $0x00, %r15
# Reduce
movq $0x7fffffffffffffff, %rbp
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rbp, %r11
# Multiply top half by 19
movq $19, %rax
mulq %r12
xorq %r12, %r12
addq %rax, %rcx
movq $19, %rax
adcq %rdx, %r12
mulq %r13
xorq %r13, %r13
addq %rax, %r9
movq $19, %rax
adcq %rdx, %r13
mulq %r14
xorq %r14, %r14
addq %rax, %r10
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
adcq %rax, %r11
adcq $0x00, %rdx
# Overflow
shldq $0x01, %r11, %rdx
imulq $19, %rdx, %rax
andq %rbp, %r11
addq %rax, %rcx
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rbp, %r11
addq %rax, %rcx
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %rcx, (%rsp)
movq %r9, 8(%rsp)
movq %r10, 16(%rsp)
movq %r11, 24(%rsp)
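# Note: 0x1db42 = 121666 = (486662 + 2) / 4, the Curve25519 constant used
# in the Montgomery ladder's doubling formula. The product stays small, so
# a single fold by 19 is enough to reduce it below 2^255.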
# Multiply by 121666
movq $0x1db42, %rax
mulq 128(%rsp)
xorq %r10, %r10
movq %rax, %rcx
movq %rdx, %r9
movq $0x1db42, %rax
mulq 136(%rsp)
xorq %r11, %r11
addq %rax, %r9
adcq %rdx, %r10
movq $0x1db42, %rax
mulq 144(%rsp)
xorq %r13, %r13
addq %rax, %r10
adcq %rdx, %r11
movq $0x1db42, %rax
mulq 152(%rsp)
movq $0x7fffffffffffffff, %r12
addq %rax, %r11
adcq %rdx, %r13
shldq $0x01, %r11, %r13
andq %r12, %r11
movq $19, %rax
mulq %r13
addq %rax, %rcx
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
movq %rcx, 32(%rsp)
movq %r9, 40(%rsp)
movq %r10, 48(%rsp)
movq %r11, 56(%rsp)
# Square
# A[0] * A[1]
movq 64(%rsp), %rax
mulq 72(%rsp)
movq %rax, %r9
movq %rdx, %r10
# A[0] * A[2]
movq 64(%rsp), %rax
mulq 80(%rsp)
xorq %r11, %r11
addq %rax, %r10
adcq %rdx, %r11
# A[0] * A[3]
movq 64(%rsp), %rax
mulq 88(%rsp)
xorq %r12, %r12
addq %rax, %r11
adcq %rdx, %r12
# A[1] * A[2]
movq 72(%rsp), %rax
mulq 80(%rsp)
xorq %r13, %r13
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * A[3]
movq 72(%rsp), %rax
mulq 88(%rsp)
addq %rax, %r12
adcq %rdx, %r13
# A[2] * A[3]
movq 80(%rsp), %rax
mulq 88(%rsp)
xorq %r14, %r14
addq %rax, %r13
adcq %rdx, %r14
# Double
xorq %r15, %r15
addq %r9, %r9
adcq %r10, %r10
adcq %r11, %r11
adcq %r12, %r12
adcq %r13, %r13
adcq %r14, %r14
adcq $0x00, %r15
# A[0] * A[0]
movq 64(%rsp), %rax
mulq %rax
movq %rax, %rcx
movq %rdx, %rbp
# A[1] * A[1]
movq 72(%rsp), %rax
mulq %rax
addq %rbp, %r9
adcq %rax, %r10
adcq $0x00, %rdx
movq %rdx, %rbp
# A[2] * A[2]
movq 80(%rsp), %rax
mulq %rax
addq %rbp, %r11
adcq %rax, %r12
adcq $0x00, %rdx
movq %rdx, %rbp
# A[3] * A[3]
movq 88(%rsp), %rax
mulq %rax
addq %rax, %r14
adcq %rdx, %r15
addq %rbp, %r13
adcq $0x00, %r14
adcq $0x00, %r15
# Reduce
movq $0x7fffffffffffffff, %rbp
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rbp, %r11
# Multiply top half by 19
movq $19, %rax
mulq %r12
xorq %r12, %r12
addq %rax, %rcx
movq $19, %rax
adcq %rdx, %r12
mulq %r13
xorq %r13, %r13
addq %rax, %r9
movq $19, %rax
adcq %rdx, %r13
mulq %r14
xorq %r14, %r14
addq %rax, %r10
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
adcq %rax, %r11
adcq $0x00, %rdx
# Overflow
shldq $0x01, %r11, %rdx
imulq $19, %rdx, %rax
andq %rbp, %r11
addq %rax, %rcx
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rbp, %r11
addq %rax, %rcx
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %rcx, 64(%rsp)
movq %r9, 72(%rsp)
movq %r10, 80(%rsp)
movq %r11, 88(%rsp)
# Add
movq 96(%rsp), %rcx
movq 104(%rsp), %r9
addq 32(%rsp), %rcx
movq 112(%rsp), %r10
adcq 40(%rsp), %r9
movq 120(%rsp), %rbp
adcq 48(%rsp), %r10
movq $-19, %rax
adcq 56(%rsp), %rbp
movq $0x7fffffffffffffff, %rdx
movq %rbp, %r11
sarq $63, %rbp
# Mask the modulus
andq %rbp, %rax
andq %rbp, %rdx
# Sub modulus (if overflow)
subq %rax, %rcx
sbbq %rbp, %r9
sbbq %rbp, %r10
sbbq %rdx, %r11
movq %rcx, 96(%rsp)
movq %r9, 104(%rsp)
movq %r10, 112(%rsp)
movq %r11, 120(%rsp)
# Multiply
# A[0] * B[0]
movq (%rsp), %rax
mulq (%r8)
movq %rax, %rcx
movq %rdx, %r9
# A[0] * B[1]
movq 8(%rsp), %rax
mulq (%r8)
xorq %r10, %r10
addq %rax, %r9
adcq %rdx, %r10
# A[1] * B[0]
movq (%rsp), %rax
mulq 8(%r8)
xorq %r11, %r11
addq %rax, %r9
adcq %rdx, %r10
adcq $0x00, %r11
# A[0] * B[2]
movq 16(%rsp), %rax
mulq (%r8)
addq %rax, %r10
adcq %rdx, %r11
# A[1] * B[1]
movq 8(%rsp), %rax
mulq 8(%r8)
xorq %r12, %r12
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[2] * B[0]
movq (%rsp), %rax
mulq 16(%r8)
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[0] * B[3]
movq 24(%rsp), %rax
mulq (%r8)
xorq %r13, %r13
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[2]
movq 16(%rsp), %rax
mulq 8(%r8)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[2] * B[1]
movq 8(%rsp), %rax
mulq 16(%r8)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[3] * B[0]
movq (%rsp), %rax
mulq 24(%r8)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[3]
movq 24(%rsp), %rax
mulq 8(%r8)
xorq %r14, %r14
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[2]
movq 16(%rsp), %rax
mulq 16(%r8)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[3] * B[1]
movq 8(%rsp), %rax
mulq 24(%r8)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[3]
movq 24(%rsp), %rax
mulq 16(%r8)
xorq %r15, %r15
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[2]
movq 16(%rsp), %rax
mulq 24(%r8)
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[3]
movq 24(%rsp), %rax
mulq 24(%r8)
addq %rax, %r14
adcq %rdx, %r15
# Reduce
movq $0x7fffffffffffffff, %rbp
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rbp, %r11
# Multiply top half by 19
movq $19, %rax
mulq %r12
xorq %r12, %r12
addq %rax, %rcx
movq $19, %rax
adcq %rdx, %r12
mulq %r13
xorq %r13, %r13
addq %rax, %r9
movq $19, %rax
adcq %rdx, %r13
mulq %r14
xorq %r14, %r14
addq %rax, %r10
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
adcq %rax, %r11
adcq $0x00, %rdx
# Overflow
shldq $0x01, %r11, %rdx
imulq $19, %rdx, %rax
andq %rbp, %r11
addq %rax, %rcx
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rbp, %r11
addq %rax, %rcx
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %rcx, 32(%rsp)
movq %r9, 40(%rsp)
movq %r10, 48(%rsp)
movq %r11, 56(%rsp)
# Multiply
# A[0] * B[0]
movq 96(%rsp), %rax
mulq 128(%rsp)
movq %rax, %rcx
movq %rdx, %r9
# A[0] * B[1]
movq 104(%rsp), %rax
mulq 128(%rsp)
xorq %r10, %r10
addq %rax, %r9
adcq %rdx, %r10
# A[1] * B[0]
movq 96(%rsp), %rax
mulq 136(%rsp)
xorq %r11, %r11
addq %rax, %r9
adcq %rdx, %r10
adcq $0x00, %r11
# A[0] * B[2]
movq 112(%rsp), %rax
mulq 128(%rsp)
addq %rax, %r10
adcq %rdx, %r11
# A[1] * B[1]
movq 104(%rsp), %rax
mulq 136(%rsp)
xorq %r12, %r12
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[2] * B[0]
movq 96(%rsp), %rax
mulq 144(%rsp)
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[0] * B[3]
movq 120(%rsp), %rax
mulq 128(%rsp)
xorq %r13, %r13
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[2]
movq 112(%rsp), %rax
mulq 136(%rsp)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[2] * B[1]
movq 104(%rsp), %rax
mulq 144(%rsp)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[3] * B[0]
movq 96(%rsp), %rax
mulq 152(%rsp)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[3]
movq 120(%rsp), %rax
mulq 136(%rsp)
xorq %r14, %r14
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[2]
movq 112(%rsp), %rax
mulq 144(%rsp)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[3] * B[1]
movq 104(%rsp), %rax
mulq 152(%rsp)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[3]
movq 120(%rsp), %rax
mulq 144(%rsp)
xorq %r15, %r15
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[2]
movq 112(%rsp), %rax
mulq 152(%rsp)
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[3]
movq 120(%rsp), %rax
mulq 152(%rsp)
addq %rax, %r14
adcq %rdx, %r15
# Reduce
movq $0x7fffffffffffffff, %rbp
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rbp, %r11
# Multiply top half by 19
movq $19, %rax
mulq %r12
xorq %r12, %r12
addq %rax, %rcx
movq $19, %rax
adcq %rdx, %r12
mulq %r13
xorq %r13, %r13
addq %rax, %r9
movq $19, %rax
adcq %rdx, %r13
mulq %r14
xorq %r14, %r14
addq %rax, %r10
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
adcq %rax, %r11
adcq $0x00, %rdx
# Overflow
shldq $0x01, %r11, %rdx
imulq $19, %rdx, %rax
andq %rbp, %r11
addq %rax, %rcx
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rbp, %r11
addq %rax, %rcx
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %rcx, (%rsp)
movq %r9, 8(%rsp)
movq %r10, 16(%rsp)
movq %r11, 24(%rsp)
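# Ladder loop control: 168(%rsp) counts bits within the current 64-bit
# scalar word (63..0) and 160(%rsp) counts the remaining words; when a
# word is exhausted the bit index is reset to 63 and the next scalar
# word is processed.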
decb 168(%rsp)
jge L_curve25519_x64_bits
movq $63, 168(%rsp)
decb 160(%rsp)
jge L_curve25519_x64_words
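# Note: the inversion below appears to use Fermat's little theorem,
# z^-1 = z^(p-2) = z^(2^255 - 21) for p = 2^255-19, evaluated with a
# fixed chain of fe_sq_x64 / fe_sq_n_x64 / fe_mul_x64 calls so that the
# sequence of field operations does not depend on the data.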
# Invert
leaq 32(%rsp), %rdi
movq %rsp, %rsi
#ifndef __APPLE__
callq fe_sq_x64@plt
#else
callq _fe_sq_x64
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 32(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_x64@plt
#else
callq _fe_sq_x64
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 64(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_x64@plt
#else
callq _fe_sq_x64
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
movq %rsp, %rsi
leaq 64(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_x64@plt
#else
callq _fe_mul_x64
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 32(%rsp), %rsi
leaq 64(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_x64@plt
#else
callq _fe_mul_x64
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 32(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_x64@plt
#else
callq _fe_sq_x64
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 64(%rsp), %rsi
leaq 96(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_x64@plt
#else
callq _fe_mul_x64
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 64(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_x64@plt
#else
callq _fe_sq_x64
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 96(%rsp), %rsi
movq $4, %rdx
#ifndef __APPLE__
callq fe_sq_n_x64@plt
#else
callq _fe_sq_n_x64
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 96(%rsp), %rsi
leaq 64(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_x64@plt
#else
callq _fe_mul_x64
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 64(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_x64@plt
#else
callq _fe_sq_x64
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 96(%rsp), %rsi
movq $9, %rdx
#ifndef __APPLE__
callq fe_sq_n_x64@plt
#else
callq _fe_sq_n_x64
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 96(%rsp), %rsi
leaq 64(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_x64@plt
#else
callq _fe_mul_x64
#endif /* __APPLE__ */
leaq 128(%rsp), %rdi
leaq 96(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_x64@plt
#else
callq _fe_sq_x64
#endif /* __APPLE__ */
leaq 128(%rsp), %rdi
leaq 128(%rsp), %rsi
movq $19, %rdx
#ifndef __APPLE__
callq fe_sq_n_x64@plt
#else
callq _fe_sq_n_x64
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 128(%rsp), %rsi
leaq 96(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_x64@plt
#else
callq _fe_mul_x64
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 96(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_x64@plt
#else
callq _fe_sq_x64
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 96(%rsp), %rsi
movq $9, %rdx
#ifndef __APPLE__
callq fe_sq_n_x64@plt
#else
callq _fe_sq_n_x64
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 96(%rsp), %rsi
leaq 64(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_x64@plt
#else
callq _fe_mul_x64
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 64(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_x64@plt
#else
callq _fe_sq_x64
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 96(%rsp), %rsi
movq $49, %rdx
#ifndef __APPLE__
callq fe_sq_n_x64@plt
#else
callq _fe_sq_n_x64
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 96(%rsp), %rsi
leaq 64(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_x64@plt
#else
callq _fe_mul_x64
#endif /* __APPLE__ */
leaq 128(%rsp), %rdi
leaq 96(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_x64@plt
#else
callq _fe_sq_x64
#endif /* __APPLE__ */
leaq 128(%rsp), %rdi
leaq 128(%rsp), %rsi
movq $0x63, %rdx
#ifndef __APPLE__
callq fe_sq_n_x64@plt
#else
callq _fe_sq_n_x64
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 128(%rsp), %rsi
leaq 96(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_x64@plt
#else
callq _fe_mul_x64
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 96(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_x64@plt
#else
callq _fe_sq_x64
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 96(%rsp), %rsi
movq $49, %rdx
#ifndef __APPLE__
callq fe_sq_n_x64@plt
#else
callq _fe_sq_n_x64
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 96(%rsp), %rsi
leaq 64(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_x64@plt
#else
callq _fe_mul_x64
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 64(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_x64@plt
#else
callq _fe_sq_x64
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 64(%rsp), %rsi
movq $4, %rdx
#ifndef __APPLE__
callq fe_sq_n_x64@plt
#else
callq _fe_sq_n_x64
#endif /* __APPLE__ */
movq %rsp, %rdi
leaq 64(%rsp), %rsi
leaq 32(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_x64@plt
#else
callq _fe_mul_x64
#endif /* __APPLE__ */
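# The pointer reloaded from 176(%rsp) below is the saved output argument;
# the multiply that follows appears to combine the x value already held in
# that buffer with the inverse just computed, giving the affine result.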
movq 176(%rsp), %rdi
# Multiply
# A[0] * B[0]
movq (%rsp), %rax
mulq (%rdi)
movq %rax, %rcx
movq %rdx, %r9
# A[0] * B[1]
movq 8(%rsp), %rax
mulq (%rdi)
xorq %r10, %r10
addq %rax, %r9
adcq %rdx, %r10
# A[1] * B[0]
movq (%rsp), %rax
mulq 8(%rdi)
xorq %r11, %r11
addq %rax, %r9
adcq %rdx, %r10
adcq $0x00, %r11
# A[0] * B[2]
movq 16(%rsp), %rax
mulq (%rdi)
addq %rax, %r10
adcq %rdx, %r11
# A[1] * B[1]
movq 8(%rsp), %rax
mulq 8(%rdi)
xorq %r12, %r12
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[2] * B[0]
movq (%rsp), %rax
mulq 16(%rdi)
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[0] * B[3]
movq 24(%rsp), %rax
mulq (%rdi)
xorq %r13, %r13
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[2]
movq 16(%rsp), %rax
mulq 8(%rdi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[2] * B[1]
movq 8(%rsp), %rax
mulq 16(%rdi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[3] * B[0]
movq (%rsp), %rax
mulq 24(%rdi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[3]
movq 24(%rsp), %rax
mulq 8(%rdi)
xorq %r14, %r14
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[2]
movq 16(%rsp), %rax
mulq 16(%rdi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[3] * B[1]
movq 8(%rsp), %rax
mulq 24(%rdi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[3]
movq 24(%rsp), %rax
mulq 16(%rdi)
xorq %r15, %r15
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[2]
movq 16(%rsp), %rax
mulq 24(%rdi)
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[3]
movq 24(%rsp), %rax
mulq 24(%rdi)
addq %rax, %r14
adcq %rdx, %r15
# Reduce
movq $0x7fffffffffffffff, %rbp
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rbp, %r11
# Multiply top half by 19
movq $19, %rax
mulq %r12
xorq %r12, %r12
addq %rax, %rcx
movq $19, %rax
adcq %rdx, %r12
mulq %r13
xorq %r13, %r13
addq %rax, %r9
movq $19, %rax
adcq %rdx, %r13
mulq %r14
xorq %r14, %r14
addq %rax, %r10
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
adcq %rax, %r11
adcq $0x00, %rdx
# Overflow
shldq $0x01, %r11, %rdx
imulq $19, %rdx, %rax
andq %rbp, %r11
addq %rax, %rcx
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rbp, %r11
addq %rax, %rcx
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %rcx, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
xorq %rax, %rax
addq $0xb8, %rsp
popq %rbp
popq %rbx
popq %r15
popq %r14
popq %r13
popq %r12
repz retq
#ifndef __APPLE__
.size curve25519_x64,.-curve25519_x64
#endif /* __APPLE__ */
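# fe_pow22523_x64 raises the input to the power (p-5)/8 = 2^252 - 3 for
# p = 2^255-19, the exponent commonly used when computing square roots
# (for example during point decompression).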
#ifndef __APPLE__
.text
.globl fe_pow22523_x64
.type fe_pow22523_x64,@function
.align 4
fe_pow22523_x64:
#else
.section __TEXT,__text
.globl _fe_pow22523_x64
.p2align 2
_fe_pow22523_x64:
#endif /* __APPLE__ */
subq $0x70, %rsp
# pow22523
movq %rdi, 96(%rsp)
movq %rsi, 104(%rsp)
movq %rsp, %rdi
movq 104(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_x64@plt
#else
callq _fe_sq_x64
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
movq %rsp, %rsi
#ifndef __APPLE__
callq fe_sq_x64@plt
#else
callq _fe_sq_x64
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 32(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_x64@plt
#else
callq _fe_sq_x64
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
movq 104(%rsp), %rsi
leaq 32(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_x64@plt
#else
callq _fe_mul_x64
#endif /* __APPLE__ */
movq %rsp, %rdi
movq %rsp, %rsi
leaq 32(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_x64@plt
#else
callq _fe_mul_x64
#endif /* __APPLE__ */
movq %rsp, %rdi
movq %rsp, %rsi
#ifndef __APPLE__
callq fe_sq_x64@plt
#else
callq _fe_sq_x64
#endif /* __APPLE__ */
movq %rsp, %rdi
leaq 32(%rsp), %rsi
movq %rsp, %rdx
#ifndef __APPLE__
callq fe_mul_x64@plt
#else
callq _fe_mul_x64
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
movq %rsp, %rsi
#ifndef __APPLE__
callq fe_sq_x64@plt
#else
callq _fe_sq_x64
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 32(%rsp), %rsi
movq $4, %rdx
#ifndef __APPLE__
callq fe_sq_n_x64@plt
#else
callq _fe_sq_n_x64
#endif /* __APPLE__ */
movq %rsp, %rdi
leaq 32(%rsp), %rsi
movq %rsp, %rdx
#ifndef __APPLE__
callq fe_mul_x64@plt
#else
callq _fe_mul_x64
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
movq %rsp, %rsi
#ifndef __APPLE__
callq fe_sq_x64@plt
#else
callq _fe_sq_x64
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 32(%rsp), %rsi
movq $9, %rdx
#ifndef __APPLE__
callq fe_sq_n_x64@plt
#else
callq _fe_sq_n_x64
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 32(%rsp), %rsi
movq %rsp, %rdx
#ifndef __APPLE__
callq fe_mul_x64@plt
#else
callq _fe_mul_x64
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 32(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_x64@plt
#else
callq _fe_sq_x64
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 64(%rsp), %rsi
movq $19, %rdx
#ifndef __APPLE__
callq fe_sq_n_x64@plt
#else
callq _fe_sq_n_x64
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 64(%rsp), %rsi
leaq 32(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_x64@plt
#else
callq _fe_mul_x64
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 32(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_x64@plt
#else
callq _fe_sq_x64
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 32(%rsp), %rsi
movq $9, %rdx
#ifndef __APPLE__
callq fe_sq_n_x64@plt
#else
callq _fe_sq_n_x64
#endif /* __APPLE__ */
movq %rsp, %rdi
leaq 32(%rsp), %rsi
movq %rsp, %rdx
#ifndef __APPLE__
callq fe_mul_x64@plt
#else
callq _fe_mul_x64
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
movq %rsp, %rsi
#ifndef __APPLE__
callq fe_sq_x64@plt
#else
callq _fe_sq_x64
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 32(%rsp), %rsi
movq $49, %rdx
#ifndef __APPLE__
callq fe_sq_n_x64@plt
#else
callq _fe_sq_n_x64
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 32(%rsp), %rsi
movq %rsp, %rdx
#ifndef __APPLE__
callq fe_mul_x64@plt
#else
callq _fe_mul_x64
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 32(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_x64@plt
#else
callq _fe_sq_x64
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 64(%rsp), %rsi
movq $0x63, %rdx
#ifndef __APPLE__
callq fe_sq_n_x64@plt
#else
callq _fe_sq_n_x64
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 64(%rsp), %rsi
leaq 32(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_x64@plt
#else
callq _fe_mul_x64
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 32(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_x64@plt
#else
callq _fe_sq_x64
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 32(%rsp), %rsi
movq $49, %rdx
#ifndef __APPLE__
callq fe_sq_n_x64@plt
#else
callq _fe_sq_n_x64
#endif /* __APPLE__ */
movq %rsp, %rdi
leaq 32(%rsp), %rsi
movq %rsp, %rdx
#ifndef __APPLE__
callq fe_mul_x64@plt
#else
callq _fe_mul_x64
#endif /* __APPLE__ */
movq %rsp, %rdi
movq %rsp, %rsi
#ifndef __APPLE__
callq fe_sq_x64@plt
#else
callq _fe_sq_x64
#endif /* __APPLE__ */
movq %rsp, %rdi
movq %rsp, %rsi
#ifndef __APPLE__
callq fe_sq_x64@plt
#else
callq _fe_sq_x64
#endif /* __APPLE__ */
movq 96(%rsp), %rdi
movq %rsp, %rsi
movq 104(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_x64@plt
#else
callq _fe_mul_x64
#endif /* __APPLE__ */
movq 104(%rsp), %rsi
movq 96(%rsp), %rdi
addq $0x70, %rsp
repz retq
#ifndef __APPLE__
.size fe_pow22523_x64,.-fe_pow22523_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_ge_to_p2_x64
.type fe_ge_to_p2_x64,@function
.align 4
fe_ge_to_p2_x64:
#else
.section __TEXT,__text
.globl _fe_ge_to_p2_x64
.p2align 2
_fe_ge_to_p2_x64:
#endif /* __APPLE__ */
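# Converts a point from p1p1 (completed) to p2 (projective) coordinates.
# The three products below appear to compute X3 = X*T, Y3 = Y*Z and
# Z3 = Z*T from the pointers saved on the stack.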
pushq %rbx
pushq %r12
pushq %r13
pushq %r14
pushq %r15
subq $40, %rsp
movq %rsi, (%rsp)
movq %rdx, 8(%rsp)
movq %rcx, 16(%rsp)
movq %r8, 24(%rsp)
movq %r9, 32(%rsp)
movq 16(%rsp), %rsi
movq 88(%rsp), %rbx
# Multiply
# A[0] * B[0]
movq (%rbx), %rax
mulq (%rsi)
movq %rax, %r8
movq %rdx, %r9
# A[0] * B[1]
movq 8(%rbx), %rax
mulq (%rsi)
xorq %r10, %r10
addq %rax, %r9
adcq %rdx, %r10
# A[1] * B[0]
movq (%rbx), %rax
mulq 8(%rsi)
xorq %r11, %r11
addq %rax, %r9
adcq %rdx, %r10
adcq $0x00, %r11
# A[0] * B[2]
movq 16(%rbx), %rax
mulq (%rsi)
addq %rax, %r10
adcq %rdx, %r11
# A[1] * B[1]
movq 8(%rbx), %rax
mulq 8(%rsi)
xorq %r12, %r12
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[2] * B[0]
movq (%rbx), %rax
mulq 16(%rsi)
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[0] * B[3]
movq 24(%rbx), %rax
mulq (%rsi)
xorq %r13, %r13
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[2]
movq 16(%rbx), %rax
mulq 8(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[2] * B[1]
movq 8(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[3] * B[0]
movq (%rbx), %rax
mulq 24(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[3]
movq 24(%rbx), %rax
mulq 8(%rsi)
xorq %r14, %r14
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[2]
movq 16(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[3] * B[1]
movq 8(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[3]
movq 24(%rbx), %rax
mulq 16(%rsi)
xorq %r15, %r15
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[2]
movq 16(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[3]
movq 24(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r14
adcq %rdx, %r15
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rcx, %r11
# Multiply top half by 19
movq $19, %rax
mulq %r12
xorq %r12, %r12
addq %rax, %r8
movq $19, %rax
adcq %rdx, %r12
mulq %r13
xorq %r13, %r13
addq %rax, %r9
movq $19, %rax
adcq %rdx, %r13
mulq %r14
xorq %r14, %r14
addq %rax, %r10
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
adcq %rax, %r11
adcq $0x00, %rdx
# Overflow
shldq $0x01, %r11, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq (%rsp), %rdi
movq 24(%rsp), %rsi
movq 32(%rsp), %rbx
# Multiply
# A[0] * B[0]
movq (%rbx), %rax
mulq (%rsi)
movq %rax, %r8
movq %rdx, %r9
# A[0] * B[1]
movq 8(%rbx), %rax
mulq (%rsi)
xorq %r10, %r10
addq %rax, %r9
adcq %rdx, %r10
# A[1] * B[0]
movq (%rbx), %rax
mulq 8(%rsi)
xorq %r11, %r11
addq %rax, %r9
adcq %rdx, %r10
adcq $0x00, %r11
# A[0] * B[2]
movq 16(%rbx), %rax
mulq (%rsi)
addq %rax, %r10
adcq %rdx, %r11
# A[1] * B[1]
movq 8(%rbx), %rax
mulq 8(%rsi)
xorq %r12, %r12
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[2] * B[0]
movq (%rbx), %rax
mulq 16(%rsi)
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[0] * B[3]
movq 24(%rbx), %rax
mulq (%rsi)
xorq %r13, %r13
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[2]
movq 16(%rbx), %rax
mulq 8(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[2] * B[1]
movq 8(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[3] * B[0]
movq (%rbx), %rax
mulq 24(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[3]
movq 24(%rbx), %rax
mulq 8(%rsi)
xorq %r14, %r14
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[2]
movq 16(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[3] * B[1]
movq 8(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[3]
movq 24(%rbx), %rax
mulq 16(%rsi)
xorq %r15, %r15
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[2]
movq 16(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[3]
movq 24(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r14
adcq %rdx, %r15
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rcx, %r11
# Multiply top half by 19
movq $19, %rax
mulq %r12
xorq %r12, %r12
addq %rax, %r8
movq $19, %rax
adcq %rdx, %r12
mulq %r13
xorq %r13, %r13
addq %rax, %r9
movq $19, %rax
adcq %rdx, %r13
mulq %r14
xorq %r14, %r14
addq %rax, %r10
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
adcq %rax, %r11
adcq $0x00, %rdx
# Overflow
shldq $0x01, %r11, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 8(%rsp), %rdi
movq 32(%rsp), %rsi
movq 88(%rsp), %rbx
# Multiply
# A[0] * B[0]
movq (%rbx), %rax
mulq (%rsi)
movq %rax, %r8
movq %rdx, %r9
# A[0] * B[1]
movq 8(%rbx), %rax
mulq (%rsi)
xorq %r10, %r10
addq %rax, %r9
adcq %rdx, %r10
# A[1] * B[0]
movq (%rbx), %rax
mulq 8(%rsi)
xorq %r11, %r11
addq %rax, %r9
adcq %rdx, %r10
adcq $0x00, %r11
# A[0] * B[2]
movq 16(%rbx), %rax
mulq (%rsi)
addq %rax, %r10
adcq %rdx, %r11
# A[1] * B[1]
movq 8(%rbx), %rax
mulq 8(%rsi)
xorq %r12, %r12
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[2] * B[0]
movq (%rbx), %rax
mulq 16(%rsi)
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[0] * B[3]
movq 24(%rbx), %rax
mulq (%rsi)
xorq %r13, %r13
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[2]
movq 16(%rbx), %rax
mulq 8(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[2] * B[1]
movq 8(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[3] * B[0]
movq (%rbx), %rax
mulq 24(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[3]
movq 24(%rbx), %rax
mulq 8(%rsi)
xorq %r14, %r14
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[2]
movq 16(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[3] * B[1]
movq 8(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[3]
movq 24(%rbx), %rax
mulq 16(%rsi)
xorq %r15, %r15
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[2]
movq 16(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[3]
movq 24(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r14
adcq %rdx, %r15
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rcx, %r11
# Multiply top half by 19
movq $19, %rax
mulq %r12
xorq %r12, %r12
addq %rax, %r8
movq $19, %rax
adcq %rdx, %r12
mulq %r13
xorq %r13, %r13
addq %rax, %r9
movq $19, %rax
adcq %rdx, %r13
mulq %r14
xorq %r14, %r14
addq %rax, %r10
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
adcq %rax, %r11
adcq $0x00, %rdx
# Overflow
shldq $0x01, %r11, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
addq $40, %rsp
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbx
repz retq
#ifndef __APPLE__
.size fe_ge_to_p2_x64,.-fe_ge_to_p2_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_ge_to_p3_x64
.type fe_ge_to_p3_x64,@function
.align 4
fe_ge_to_p3_x64:
#else
.section __TEXT,__text
.globl _fe_ge_to_p3_x64
.p2align 2
_fe_ge_to_p3_x64:
#endif /* __APPLE__ */
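# Converts a point from p1p1 (completed) to p3 (extended) coordinates.
# The four products below appear to compute X3 = X*T, Y3 = Y*Z, Z3 = Z*T
# and T3 = X*Y from the pointers saved on the stack.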
pushq %rbx
pushq %r12
pushq %r13
pushq %r14
pushq %r15
subq $40, %rsp
movq %rsi, (%rsp)
movq %rdx, 8(%rsp)
movq %rcx, 16(%rsp)
movq %r8, 24(%rsp)
movq %r9, 32(%rsp)
movq 24(%rsp), %rsi
movq 96(%rsp), %rbx
# Multiply
# A[0] * B[0]
movq (%rbx), %rax
mulq (%rsi)
movq %rax, %r8
movq %rdx, %r9
# A[0] * B[1]
movq 8(%rbx), %rax
mulq (%rsi)
xorq %r10, %r10
addq %rax, %r9
adcq %rdx, %r10
# A[1] * B[0]
movq (%rbx), %rax
mulq 8(%rsi)
xorq %r11, %r11
addq %rax, %r9
adcq %rdx, %r10
adcq $0x00, %r11
# A[0] * B[2]
movq 16(%rbx), %rax
mulq (%rsi)
addq %rax, %r10
adcq %rdx, %r11
# A[1] * B[1]
movq 8(%rbx), %rax
mulq 8(%rsi)
xorq %r12, %r12
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[2] * B[0]
movq (%rbx), %rax
mulq 16(%rsi)
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[0] * B[3]
movq 24(%rbx), %rax
mulq (%rsi)
xorq %r13, %r13
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[2]
movq 16(%rbx), %rax
mulq 8(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[2] * B[1]
movq 8(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[3] * B[0]
movq (%rbx), %rax
mulq 24(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[3]
movq 24(%rbx), %rax
mulq 8(%rsi)
xorq %r14, %r14
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[2]
movq 16(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[3] * B[1]
movq 8(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[3]
movq 24(%rbx), %rax
mulq 16(%rsi)
xorq %r15, %r15
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[2]
movq 16(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[3]
movq 24(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r14
adcq %rdx, %r15
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rcx, %r11
# Multiply top half by 19
movq $19, %rax
mulq %r12
xorq %r12, %r12
addq %rax, %r8
movq $19, %rax
adcq %rdx, %r12
mulq %r13
xorq %r13, %r13
addq %rax, %r9
movq $19, %rax
adcq %rdx, %r13
mulq %r14
xorq %r14, %r14
addq %rax, %r10
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
adcq %rax, %r11
adcq $0x00, %rdx
# Overflow
shldq $0x01, %r11, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq (%rsp), %rdi
movq 32(%rsp), %rsi
movq 88(%rsp), %rbx
# Multiply
# A[0] * B[0]
movq (%rbx), %rax
mulq (%rsi)
movq %rax, %r8
movq %rdx, %r9
# A[0] * B[1]
movq 8(%rbx), %rax
mulq (%rsi)
xorq %r10, %r10
addq %rax, %r9
adcq %rdx, %r10
# A[1] * B[0]
movq (%rbx), %rax
mulq 8(%rsi)
xorq %r11, %r11
addq %rax, %r9
adcq %rdx, %r10
adcq $0x00, %r11
# A[0] * B[2]
movq 16(%rbx), %rax
mulq (%rsi)
addq %rax, %r10
adcq %rdx, %r11
# A[1] * B[1]
movq 8(%rbx), %rax
mulq 8(%rsi)
xorq %r12, %r12
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[2] * B[0]
movq (%rbx), %rax
mulq 16(%rsi)
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[0] * B[3]
movq 24(%rbx), %rax
mulq (%rsi)
xorq %r13, %r13
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[2]
movq 16(%rbx), %rax
mulq 8(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[2] * B[1]
movq 8(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[3] * B[0]
movq (%rbx), %rax
mulq 24(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[3]
movq 24(%rbx), %rax
mulq 8(%rsi)
xorq %r14, %r14
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[2]
movq 16(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[3] * B[1]
movq 8(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[3]
movq 24(%rbx), %rax
mulq 16(%rsi)
xorq %r15, %r15
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[2]
movq 16(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[3]
movq 24(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r14
adcq %rdx, %r15
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rcx, %r11
# Multiply top half by 19
movq $19, %rax
mulq %r12
xorq %r12, %r12
addq %rax, %r8
movq $19, %rax
adcq %rdx, %r12
mulq %r13
xorq %r13, %r13
addq %rax, %r9
movq $19, %rax
adcq %rdx, %r13
mulq %r14
xorq %r14, %r14
addq %rax, %r10
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
adcq %rax, %r11
adcq $0x00, %rdx
# Overflow
shldq $0x01, %r11, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 8(%rsp), %rdi
movq 88(%rsp), %rsi
movq 96(%rsp), %rbx
# Multiply
# A[0] * B[0]
movq (%rbx), %rax
mulq (%rsi)
movq %rax, %r8
movq %rdx, %r9
# A[0] * B[1]
movq 8(%rbx), %rax
mulq (%rsi)
xorq %r10, %r10
addq %rax, %r9
adcq %rdx, %r10
# A[1] * B[0]
movq (%rbx), %rax
mulq 8(%rsi)
xorq %r11, %r11
addq %rax, %r9
adcq %rdx, %r10
adcq $0x00, %r11
# A[0] * B[2]
movq 16(%rbx), %rax
mulq (%rsi)
addq %rax, %r10
adcq %rdx, %r11
# A[1] * B[1]
movq 8(%rbx), %rax
mulq 8(%rsi)
xorq %r12, %r12
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[2] * B[0]
movq (%rbx), %rax
mulq 16(%rsi)
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[0] * B[3]
movq 24(%rbx), %rax
mulq (%rsi)
xorq %r13, %r13
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[2]
movq 16(%rbx), %rax
mulq 8(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[2] * B[1]
movq 8(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[3] * B[0]
movq (%rbx), %rax
mulq 24(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[3]
movq 24(%rbx), %rax
mulq 8(%rsi)
xorq %r14, %r14
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[2]
movq 16(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[3] * B[1]
movq 8(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[3]
movq 24(%rbx), %rax
mulq 16(%rsi)
xorq %r15, %r15
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[2]
movq 16(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[3]
movq 24(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r14
adcq %rdx, %r15
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rcx, %r11
# Multiply top half by 19
movq $19, %rax
mulq %r12
xorq %r12, %r12
addq %rax, %r8
movq $19, %rax
adcq %rdx, %r12
mulq %r13
xorq %r13, %r13
addq %rax, %r9
movq $19, %rax
adcq %rdx, %r13
mulq %r14
xorq %r14, %r14
addq %rax, %r10
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
adcq %rax, %r11
adcq $0x00, %rdx
# Overflow
shldq $0x01, %r11, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 16(%rsp), %rdi
movq 24(%rsp), %rsi
movq 32(%rsp), %rbx
# Multiply
# A[0] * B[0]
movq (%rbx), %rax
mulq (%rsi)
movq %rax, %r8
movq %rdx, %r9
# A[0] * B[1]
movq 8(%rbx), %rax
mulq (%rsi)
xorq %r10, %r10
addq %rax, %r9
adcq %rdx, %r10
# A[1] * B[0]
movq (%rbx), %rax
mulq 8(%rsi)
xorq %r11, %r11
addq %rax, %r9
adcq %rdx, %r10
adcq $0x00, %r11
# A[0] * B[2]
movq 16(%rbx), %rax
mulq (%rsi)
addq %rax, %r10
adcq %rdx, %r11
# A[1] * B[1]
movq 8(%rbx), %rax
mulq 8(%rsi)
xorq %r12, %r12
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[2] * B[0]
movq (%rbx), %rax
mulq 16(%rsi)
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[0] * B[3]
movq 24(%rbx), %rax
mulq (%rsi)
xorq %r13, %r13
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[2]
movq 16(%rbx), %rax
mulq 8(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[2] * B[1]
movq 8(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[3] * B[0]
movq (%rbx), %rax
mulq 24(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[3]
movq 24(%rbx), %rax
mulq 8(%rsi)
xorq %r14, %r14
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[2]
movq 16(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[3] * B[1]
movq 8(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[3]
movq 24(%rbx), %rax
mulq 16(%rsi)
xorq %r15, %r15
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[2]
movq 16(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[3]
movq 24(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r14
adcq %rdx, %r15
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rcx, %r11
# Multiply top half by 19
movq $19, %rax
mulq %r12
xorq %r12, %r12
addq %rax, %r8
movq $19, %rax
adcq %rdx, %r12
mulq %r13
xorq %r13, %r13
addq %rax, %r9
movq $19, %rax
adcq %rdx, %r13
mulq %r14
xorq %r14, %r14
addq %rax, %r10
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
adcq %rax, %r11
adcq $0x00, %rdx
# Overflow
shldq $0x01, %r11, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
addq $40, %rsp
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbx
repz retq
#ifndef __APPLE__
.size fe_ge_to_p3_x64,.-fe_ge_to_p3_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_ge_dbl_x64
.type fe_ge_dbl_x64,@function
.align 4
fe_ge_dbl_x64:
#else
.section __TEXT,__text
.globl _fe_ge_dbl_x64
.p2align 2
_fe_ge_dbl_x64:
#endif /* __APPLE__ */
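# Point doubling. The squarings and add/sub steps below follow the usual
# doubling pattern for this representation: X1^2, Y1^2, 2*Z1^2 and
# (X1+Y1)^2 are formed and then combined into the output coordinates.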
pushq %rbx
pushq %r12
pushq %r13
pushq %r14
pushq %r15
subq $0x50, %rsp
movq %rdi, (%rsp)
movq %rsi, 8(%rsp)
movq %rdx, 16(%rsp)
movq %rcx, 24(%rsp)
movq %r8, 32(%rsp)
movq %r9, 40(%rsp)
movq (%rsp), %rdi
movq 32(%rsp), %rsi
# Square
# A[0] * A[1]
movq (%rsi), %rax
mulq 8(%rsi)
movq %rax, %r9
movq %rdx, %r10
# A[0] * A[2]
movq (%rsi), %rax
mulq 16(%rsi)
xorq %r11, %r11
addq %rax, %r10
adcq %rdx, %r11
# A[0] * A[3]
movq (%rsi), %rax
mulq 24(%rsi)
xorq %r12, %r12
addq %rax, %r11
adcq %rdx, %r12
# A[1] * A[2]
movq 8(%rsi), %rax
mulq 16(%rsi)
xorq %r13, %r13
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * A[3]
movq 8(%rsi), %rax
mulq 24(%rsi)
addq %rax, %r12
adcq %rdx, %r13
# A[2] * A[3]
movq 16(%rsi), %rax
mulq 24(%rsi)
xorq %r14, %r14
addq %rax, %r13
adcq %rdx, %r14
# Double
xorq %r15, %r15
addq %r9, %r9
adcq %r10, %r10
adcq %r11, %r11
adcq %r12, %r12
adcq %r13, %r13
adcq %r14, %r14
adcq $0x00, %r15
# A[0] * A[0]
movq (%rsi), %rax
mulq %rax
movq %rax, %r8
movq %rdx, %rcx
# A[1] * A[1]
movq 8(%rsi), %rax
mulq %rax
addq %rcx, %r9
adcq %rax, %r10
adcq $0x00, %rdx
movq %rdx, %rcx
# A[2] * A[2]
movq 16(%rsi), %rax
mulq %rax
addq %rcx, %r11
adcq %rax, %r12
adcq $0x00, %rdx
movq %rdx, %rcx
# A[3] * A[3]
movq 24(%rsi), %rax
mulq %rax
addq %rax, %r14
adcq %rdx, %r15
addq %rcx, %r13
adcq $0x00, %r14
adcq $0x00, %r15
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rcx, %r11
# Multiply top half by 19
movq $19, %rax
mulq %r12
xorq %r12, %r12
addq %rax, %r8
movq $19, %rax
adcq %rdx, %r12
mulq %r13
xorq %r13, %r13
addq %rax, %r9
movq $19, %rax
adcq %rdx, %r13
mulq %r14
xorq %r14, %r14
addq %rax, %r10
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
adcq %rax, %r11
adcq $0x00, %rdx
# Overflow
shldq $0x01, %r11, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 16(%rsp), %rdi
movq 40(%rsp), %rsi
# Square
# A[0] * A[1]
movq (%rsi), %rax
mulq 8(%rsi)
movq %rax, %r9
movq %rdx, %r10
# A[0] * A[2]
movq (%rsi), %rax
mulq 16(%rsi)
xorq %r11, %r11
addq %rax, %r10
adcq %rdx, %r11
# A[0] * A[3]
movq (%rsi), %rax
mulq 24(%rsi)
xorq %r12, %r12
addq %rax, %r11
adcq %rdx, %r12
# A[1] * A[2]
movq 8(%rsi), %rax
mulq 16(%rsi)
xorq %r13, %r13
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * A[3]
movq 8(%rsi), %rax
mulq 24(%rsi)
addq %rax, %r12
adcq %rdx, %r13
# A[2] * A[3]
movq 16(%rsi), %rax
mulq 24(%rsi)
xorq %r14, %r14
addq %rax, %r13
adcq %rdx, %r14
# Double
xorq %r15, %r15
addq %r9, %r9
adcq %r10, %r10
adcq %r11, %r11
adcq %r12, %r12
adcq %r13, %r13
adcq %r14, %r14
adcq $0x00, %r15
# A[0] * A[0]
movq (%rsi), %rax
mulq %rax
movq %rax, %r8
movq %rdx, %rcx
# A[1] * A[1]
movq 8(%rsi), %rax
mulq %rax
addq %rcx, %r9
adcq %rax, %r10
adcq $0x00, %rdx
movq %rdx, %rcx
# A[2] * A[2]
movq 16(%rsi), %rax
mulq %rax
addq %rcx, %r11
adcq %rax, %r12
adcq $0x00, %rdx
movq %rdx, %rcx
# A[3] * A[3]
movq 24(%rsi), %rax
mulq %rax
addq %rax, %r14
adcq %rdx, %r15
addq %rcx, %r13
adcq $0x00, %r14
adcq $0x00, %r15
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rcx, %r11
# Multiply top half by 19
movq $19, %rax
mulq %r12
xorq %r12, %r12
addq %rax, %r8
movq $19, %rax
adcq %rdx, %r12
mulq %r13
xorq %r13, %r13
addq %rax, %r9
movq $19, %rax
adcq %rdx, %r13
mulq %r14
xorq %r14, %r14
addq %rax, %r10
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
adcq %rax, %r11
adcq $0x00, %rdx
# Overflow
shldq $0x01, %r11, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 24(%rsp), %rdi
movq 128(%rsp), %rsi
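# Note: this block computes 2*A^2 rather than A^2. The doubling is merged
# into the reduction shifts (shldq by 2 instead of 1 for the high limbs,
# and the low limbs shifted left by 1), and the few bits shifted out at
# the very top are folded back with 0x169 = 361 = 19*19, since their unit
# weight is 2^510 and 2^510 mod (2^255-19) = 19*19.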
# Square * 2
# A[0] * A[1]
movq (%rsi), %rax
mulq 8(%rsi)
movq %rax, %r9
movq %rdx, %r10
# A[0] * A[2]
movq (%rsi), %rax
mulq 16(%rsi)
xorq %r11, %r11
addq %rax, %r10
adcq %rdx, %r11
# A[0] * A[3]
movq (%rsi), %rax
mulq 24(%rsi)
xorq %r12, %r12
addq %rax, %r11
adcq %rdx, %r12
# A[1] * A[2]
movq 8(%rsi), %rax
mulq 16(%rsi)
xorq %r13, %r13
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * A[3]
movq 8(%rsi), %rax
mulq 24(%rsi)
addq %rax, %r12
adcq %rdx, %r13
# A[2] * A[3]
movq 16(%rsi), %rax
mulq 24(%rsi)
xorq %r14, %r14
addq %rax, %r13
adcq %rdx, %r14
# Double
xorq %r15, %r15
addq %r9, %r9
adcq %r10, %r10
adcq %r11, %r11
adcq %r12, %r12
adcq %r13, %r13
adcq %r14, %r14
adcq $0x00, %r15
# A[0] * A[0]
movq (%rsi), %rax
mulq %rax
movq %rax, %r8
movq %rdx, %rcx
# A[1] * A[1]
movq 8(%rsi), %rax
mulq %rax
addq %rcx, %r9
adcq %rax, %r10
adcq $0x00, %rdx
movq %rdx, %rcx
# A[2] * A[2]
movq 16(%rsi), %rax
mulq %rax
addq %rcx, %r11
adcq %rax, %r12
adcq $0x00, %rdx
movq %rdx, %rcx
# A[3] * A[3]
movq 24(%rsi), %rax
mulq %rax
addq %rax, %r14
adcq %rdx, %r15
addq %rcx, %r13
adcq $0x00, %r14
adcq $0x00, %r15
# Reduce
movq $0x7fffffffffffffff, %rbx
xorq %rax, %rax
# Move top half into t4-t7 and remove top bit from t3
shldq $3, %r15, %rax
shldq $2, %r14, %r15
shldq $2, %r13, %r14
shldq $2, %r12, %r13
shldq $2, %r11, %r12
shldq $0x01, %r10, %r11
shldq $0x01, %r9, %r10
shldq $0x01, %r8, %r9
shlq $0x01, %r8
andq %rbx, %r11
# Two out left, one in right
andq %rbx, %r15
# Multiply top bits by 19*19
imulq $0x169, %rax, %rcx
# Multiply top half by 19
movq $19, %rax
mulq %r12
xorq %r12, %r12
addq %rax, %r8
movq $19, %rax
adcq %rdx, %r12
mulq %r13
xorq %r13, %r13
addq %rax, %r9
movq $19, %rax
adcq %rdx, %r13
mulq %r14
xorq %r14, %r14
addq %rax, %r10
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining product results in
addq %rcx, %r8
adcq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
adcq %rax, %r11
adcq $0x00, %rdx
# Overflow
shldq $0x01, %r11, %rdx
imulq $19, %rdx, %rax
andq %rbx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rbx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 8(%rsp), %rdi
movq 32(%rsp), %rsi
movq 40(%rsp), %rbx
# Add
movq (%rsi), %r8
movq 8(%rsi), %r9
addq (%rbx), %r8
movq 16(%rsi), %r10
adcq 8(%rbx), %r9
movq 24(%rsi), %rcx
adcq 16(%rbx), %r10
movq $-19, %rax
adcq 24(%rbx), %rcx
movq $0x7fffffffffffffff, %rdx
movq %rcx, %r11
sarq $63, %rcx
# Mask the modulus
andq %rcx, %rax
andq %rcx, %rdx
# Sub modulus (if overflow)
subq %rax, %r8
sbbq %rcx, %r9
sbbq %rcx, %r10
sbbq %rdx, %r11
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
leaq 48(%rsp), %rdi
movq 8(%rsp), %rsi
# Square
# A[0] * A[1]
movq (%rsi), %rax
mulq 8(%rsi)
movq %rax, %r9
movq %rdx, %r10
# A[0] * A[2]
movq (%rsi), %rax
mulq 16(%rsi)
xorq %r11, %r11
addq %rax, %r10
adcq %rdx, %r11
# A[0] * A[3]
movq (%rsi), %rax
mulq 24(%rsi)
xorq %r12, %r12
addq %rax, %r11
adcq %rdx, %r12
# A[1] * A[2]
movq 8(%rsi), %rax
mulq 16(%rsi)
xorq %r13, %r13
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * A[3]
movq 8(%rsi), %rax
mulq 24(%rsi)
addq %rax, %r12
adcq %rdx, %r13
# A[2] * A[3]
movq 16(%rsi), %rax
mulq 24(%rsi)
xorq %r14, %r14
addq %rax, %r13
adcq %rdx, %r14
# Double
xorq %r15, %r15
addq %r9, %r9
adcq %r10, %r10
adcq %r11, %r11
adcq %r12, %r12
adcq %r13, %r13
adcq %r14, %r14
adcq $0x00, %r15
# A[0] * A[0]
movq (%rsi), %rax
mulq %rax
movq %rax, %r8
movq %rdx, %rcx
# A[1] * A[1]
movq 8(%rsi), %rax
mulq %rax
addq %rcx, %r9
adcq %rax, %r10
adcq $0x00, %rdx
movq %rdx, %rcx
# A[2] * A[2]
movq 16(%rsi), %rax
mulq %rax
addq %rcx, %r11
adcq %rax, %r12
adcq $0x00, %rdx
movq %rdx, %rcx
# A[3] * A[3]
movq 24(%rsi), %rax
mulq %rax
addq %rax, %r14
adcq %rdx, %r15
addq %rcx, %r13
adcq $0x00, %r14
adcq $0x00, %r15
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rcx, %r11
# Multiply top half by 19
movq $19, %rax
mulq %r12
xorq %r12, %r12
addq %rax, %r8
movq $19, %rax
adcq %rdx, %r12
mulq %r13
xorq %r13, %r13
addq %rax, %r9
movq $19, %rax
adcq %rdx, %r13
mulq %r14
xorq %r14, %r14
addq %rax, %r10
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
adcq %rax, %r11
adcq $0x00, %rdx
# Overflow
shldq $0x01, %r11, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 8(%rsp), %rdi
movq 16(%rsp), %rsi
movq (%rsp), %rbx
# Add
movq (%rsi), %r8
movq 8(%rsi), %r9
addq (%rbx), %r8
movq 16(%rsi), %r10
adcq 8(%rbx), %r9
movq 24(%rsi), %rcx
adcq 16(%rbx), %r10
movq $-19, %rax
adcq 24(%rbx), %rcx
movq $0x7fffffffffffffff, %rdx
movq %rcx, %r11
sarq $63, %rcx
# Mask the modulus
andq %rcx, %rax
andq %rcx, %rdx
# Sub modulus (if overflow)
subq %rax, %r8
sbbq %rcx, %r9
sbbq %rcx, %r10
sbbq %rdx, %r11
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 16(%rsp), %rdi
movq 16(%rsp), %rsi
movq (%rsp), %rbx
# Sub
movq (%rsi), %r8
movq 8(%rsi), %r9
movq 16(%rsi), %r10
movq 24(%rsi), %r11
subq (%rbx), %r8
movq $0x00, %rcx
sbbq 8(%rbx), %r9
movq $-19, %rax
sbbq 16(%rbx), %r10
movq $0x7fffffffffffffff, %rdx
sbbq 24(%rbx), %r11
sbbq $0x00, %rcx
# Mask the modulus
andq %rcx, %rax
andq %rcx, %rdx
# Add modulus (if underflow)
addq %rax, %r8
adcq %rcx, %r9
adcq %rcx, %r10
adcq %rdx, %r11
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq (%rsp), %rdi
leaq 48(%rsp), %rsi
movq 8(%rsp), %rbx
# Sub
movq (%rsi), %r8
movq 8(%rsi), %r9
movq 16(%rsi), %r10
movq 24(%rsi), %r11
subq (%rbx), %r8
movq $0x00, %rcx
sbbq 8(%rbx), %r9
movq $-19, %rax
sbbq 16(%rbx), %r10
movq $0x7fffffffffffffff, %rdx
sbbq 24(%rbx), %r11
sbbq $0x00, %rcx
# Mask the modulus
andq %rcx, %rax
andq %rcx, %rdx
# Add modulus (if underflow)
addq %rax, %r8
adcq %rcx, %r9
adcq %rcx, %r10
adcq %rdx, %r11
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 24(%rsp), %rdi
movq 24(%rsp), %rsi
movq 16(%rsp), %rbx
# Sub
movq (%rsi), %r8
movq 8(%rsi), %r9
movq 16(%rsi), %r10
movq 24(%rsi), %r11
subq (%rbx), %r8
movq $0x00, %rcx
sbbq 8(%rbx), %r9
movq $-19, %rax
sbbq 16(%rbx), %r10
movq $0x7fffffffffffffff, %rdx
sbbq 24(%rbx), %r11
sbbq $0x00, %rcx
# Mask the modulus
andq %rcx, %rax
andq %rcx, %rdx
# Add modulus (if underflow)
addq %rax, %r8
adcq %rcx, %r9
adcq %rcx, %r10
adcq %rdx, %r11
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
addq $0x50, %rsp
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbx
repz retq
#ifndef __APPLE__
.size fe_ge_dbl_x64,.-fe_ge_dbl_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_ge_madd_x64
.type fe_ge_madd_x64,@function
.align 4
fe_ge_madd_x64:
#else
.section __TEXT,__text
.globl _fe_ge_madd_x64
.p2align 2
_fe_ge_madd_x64:
#endif /* __APPLE__ */
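# Mixed addition with a precomputed point. The steps below form Y1+X1 and
# Y1-X1 and multiply them by the two precomputed values passed on the
# stack, following the usual ge_madd pattern.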
pushq %rbx
pushq %r12
pushq %r13
pushq %r14
pushq %r15
subq $0x50, %rsp
movq %rdi, (%rsp)
movq %rsi, 8(%rsp)
movq %rdx, 16(%rsp)
movq %rcx, 24(%rsp)
movq %r8, 32(%rsp)
movq %r9, 40(%rsp)
movq (%rsp), %rdi
movq 40(%rsp), %rsi
movq 32(%rsp), %rbx
# Add
movq (%rsi), %r8
movq 8(%rsi), %r9
addq (%rbx), %r8
movq 16(%rsi), %r10
adcq 8(%rbx), %r9
movq 24(%rsi), %rcx
adcq 16(%rbx), %r10
movq $-19, %rax
adcq 24(%rbx), %rcx
movq $0x7fffffffffffffff, %rdx
movq %rcx, %r11
sarq $63, %rcx
# Mask the modulus
andq %rcx, %rax
andq %rcx, %rdx
# Sub modulus (if overflow)
subq %rax, %r8
sbbq %rcx, %r9
sbbq %rcx, %r10
sbbq %rdx, %r11
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 8(%rsp), %rdi
movq 40(%rsp), %rsi
movq 32(%rsp), %rbx
# Sub
movq (%rsi), %r8
movq 8(%rsi), %r9
movq 16(%rsi), %r10
movq 24(%rsi), %r11
subq (%rbx), %r8
movq $0x00, %rcx
sbbq 8(%rbx), %r9
movq $-19, %rax
sbbq 16(%rbx), %r10
movq $0x7fffffffffffffff, %rdx
sbbq 24(%rbx), %r11
sbbq $0x00, %rcx
# Mask the modulus
andq %rcx, %rax
andq %rcx, %rdx
# Add modulus (if underflow)
addq %rax, %r8
adcq %rcx, %r9
adcq %rcx, %r10
adcq %rdx, %r11
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 16(%rsp), %rdi
movq (%rsp), %rsi
movq 152(%rsp), %rbx
# Multiply
# A[0] * B[0]
movq (%rbx), %rax
mulq (%rsi)
movq %rax, %r8
movq %rdx, %r9
# A[0] * B[1]
movq 8(%rbx), %rax
mulq (%rsi)
xorq %r10, %r10
addq %rax, %r9
adcq %rdx, %r10
# A[1] * B[0]
movq (%rbx), %rax
mulq 8(%rsi)
xorq %r11, %r11
addq %rax, %r9
adcq %rdx, %r10
adcq $0x00, %r11
# A[0] * B[2]
movq 16(%rbx), %rax
mulq (%rsi)
addq %rax, %r10
adcq %rdx, %r11
# A[1] * B[1]
movq 8(%rbx), %rax
mulq 8(%rsi)
xorq %r12, %r12
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[2] * B[0]
movq (%rbx), %rax
mulq 16(%rsi)
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[0] * B[3]
movq 24(%rbx), %rax
mulq (%rsi)
xorq %r13, %r13
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[2]
movq 16(%rbx), %rax
mulq 8(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[2] * B[1]
movq 8(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[3] * B[0]
movq (%rbx), %rax
mulq 24(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[3]
movq 24(%rbx), %rax
mulq 8(%rsi)
xorq %r14, %r14
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[2]
movq 16(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[3] * B[1]
movq 8(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[3]
movq 24(%rbx), %rax
mulq 16(%rsi)
xorq %r15, %r15
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[2]
movq 16(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[3]
movq 24(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r14
adcq %rdx, %r15
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rcx, %r11
# Multiply top half by 19
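# p = 2^255 - 19, so 2^255 = 19 (mod p): the upper bits (moved into
# r12..r15 above) are folded back into the low four limbs as 19 * high.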
movq $19, %rax
mulq %r12
xorq %r12, %r12
addq %rax, %r8
movq $19, %rax
adcq %rdx, %r12
mulq %r13
xorq %r13, %r13
addq %rax, %r9
movq $19, %rax
adcq %rdx, %r13
mulq %r14
xorq %r14, %r14
addq %rax, %r10
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add in the remaining product results
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
adcq %rax, %r11
adcq $0x00, %rdx
# Overflow
shldq $0x01, %r11, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
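# The fold above may have carried into bit 255 again, so one more
# single-bit fold (19 * top bit) keeps the result below 2^255.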
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 8(%rsp), %rdi
movq 8(%rsp), %rsi
movq 160(%rsp), %rbx
# Multiply
# A[0] * B[0]
movq (%rbx), %rax
mulq (%rsi)
movq %rax, %r8
movq %rdx, %r9
# A[0] * B[1]
movq 8(%rbx), %rax
mulq (%rsi)
xorq %r10, %r10
addq %rax, %r9
adcq %rdx, %r10
# A[1] * B[0]
movq (%rbx), %rax
mulq 8(%rsi)
xorq %r11, %r11
addq %rax, %r9
adcq %rdx, %r10
adcq $0x00, %r11
# A[0] * B[2]
movq 16(%rbx), %rax
mulq (%rsi)
addq %rax, %r10
adcq %rdx, %r11
# A[1] * B[1]
movq 8(%rbx), %rax
mulq 8(%rsi)
xorq %r12, %r12
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[2] * B[0]
movq (%rbx), %rax
mulq 16(%rsi)
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[0] * B[3]
movq 24(%rbx), %rax
mulq (%rsi)
xorq %r13, %r13
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[2]
movq 16(%rbx), %rax
mulq 8(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[2] * B[1]
movq 8(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[3] * B[0]
movq (%rbx), %rax
mulq 24(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[3]
movq 24(%rbx), %rax
mulq 8(%rsi)
xorq %r14, %r14
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[2]
movq 16(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[3] * B[1]
movq 8(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[3]
movq 24(%rbx), %rax
mulq 16(%rsi)
xorq %r15, %r15
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[2]
movq 16(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[3]
movq 24(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r14
adcq %rdx, %r15
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rcx, %r11
# Multiply top half by 19
movq $19, %rax
mulq %r12
xorq %r12, %r12
addq %rax, %r8
movq $19, %rax
adcq %rdx, %r12
mulq %r13
xorq %r13, %r13
addq %rax, %r9
movq $19, %rax
adcq %rdx, %r13
mulq %r14
xorq %r14, %r14
addq %rax, %r10
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add in the remaining product results
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
adcq %rax, %r11
adcq $0x00, %rdx
# Overflow
shldq $0x01, %r11, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 24(%rsp), %rdi
movq 144(%rsp), %rsi
movq 136(%rsp), %rbx
# Multiply
# A[0] * B[0]
movq (%rbx), %rax
mulq (%rsi)
movq %rax, %r8
movq %rdx, %r9
# A[0] * B[1]
movq 8(%rbx), %rax
mulq (%rsi)
xorq %r10, %r10
addq %rax, %r9
adcq %rdx, %r10
# A[1] * B[0]
movq (%rbx), %rax
mulq 8(%rsi)
xorq %r11, %r11
addq %rax, %r9
adcq %rdx, %r10
adcq $0x00, %r11
# A[0] * B[2]
movq 16(%rbx), %rax
mulq (%rsi)
addq %rax, %r10
adcq %rdx, %r11
# A[1] * B[1]
movq 8(%rbx), %rax
mulq 8(%rsi)
xorq %r12, %r12
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[2] * B[0]
movq (%rbx), %rax
mulq 16(%rsi)
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[0] * B[3]
movq 24(%rbx), %rax
mulq (%rsi)
xorq %r13, %r13
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[2]
movq 16(%rbx), %rax
mulq 8(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[2] * B[1]
movq 8(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[3] * B[0]
movq (%rbx), %rax
mulq 24(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[3]
movq 24(%rbx), %rax
mulq 8(%rsi)
xorq %r14, %r14
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[2]
movq 16(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[3] * B[1]
movq 8(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[3]
movq 24(%rbx), %rax
mulq 16(%rsi)
xorq %r15, %r15
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[2]
movq 16(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[3]
movq 24(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r14
adcq %rdx, %r15
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rcx, %r11
# Multiply top half by 19
movq $19, %rax
mulq %r12
xorq %r12, %r12
addq %rax, %r8
movq $19, %rax
adcq %rdx, %r12
mulq %r13
xorq %r13, %r13
addq %rax, %r9
movq $19, %rax
adcq %rdx, %r13
mulq %r14
xorq %r14, %r14
addq %rax, %r10
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add in the remaining product results
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
adcq %rax, %r11
adcq $0x00, %rdx
# Overflow
shldq $0x01, %r11, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
leaq 48(%rsp), %rdi
movq 128(%rsp), %rsi
movq 128(%rsp), %rbx
# Add
movq (%rsi), %r8
movq 8(%rsi), %r9
addq (%rbx), %r8
movq 16(%rsi), %r10
adcq 8(%rbx), %r9
movq 24(%rsi), %rcx
adcq 16(%rbx), %r10
movq $-19, %rax
adcq 24(%rbx), %rcx
movq $0x7fffffffffffffff, %rdx
movq %rcx, %r11
sarq $63, %rcx
# Mask the modulus
andq %rcx, %rax
andq %rcx, %rdx
# Sub modulus (if overflow)
subq %rax, %r8
sbbq %rcx, %r9
sbbq %rcx, %r10
sbbq %rdx, %r11
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq (%rsp), %rdi
movq 16(%rsp), %rsi
movq 8(%rsp), %rbx
# Sub
movq (%rsi), %r8
movq 8(%rsi), %r9
movq 16(%rsi), %r10
movq 24(%rsi), %r11
subq (%rbx), %r8
movq $0x00, %rcx
sbbq 8(%rbx), %r9
movq $-19, %rax
sbbq 16(%rbx), %r10
movq $0x7fffffffffffffff, %rdx
sbbq 24(%rbx), %r11
sbbq $0x00, %rcx
# Mask the modulus
andq %rcx, %rax
andq %rcx, %rdx
# Add modulus (if underflow)
addq %rax, %r8
adcq %rcx, %r9
adcq %rcx, %r10
adcq %rdx, %r11
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 8(%rsp), %rdi
movq 16(%rsp), %rsi
movq 8(%rsp), %rbx
# Add
movq (%rsi), %r8
movq 8(%rsi), %r9
addq (%rbx), %r8
movq 16(%rsi), %r10
adcq 8(%rbx), %r9
movq 24(%rsi), %rcx
adcq 16(%rbx), %r10
movq $-19, %rax
adcq 24(%rbx), %rcx
movq $0x7fffffffffffffff, %rdx
movq %rcx, %r11
sarq $63, %rcx
# Mask the modulus
andq %rcx, %rax
andq %rcx, %rdx
# Sub modulus (if overflow)
subq %rax, %r8
sbbq %rcx, %r9
sbbq %rcx, %r10
sbbq %rdx, %r11
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 16(%rsp), %rdi
leaq 48(%rsp), %rsi
movq 24(%rsp), %rbx
# Add
movq (%rsi), %r8
movq 8(%rsi), %r9
addq (%rbx), %r8
movq 16(%rsi), %r10
adcq 8(%rbx), %r9
movq 24(%rsi), %rcx
adcq 16(%rbx), %r10
movq $-19, %rax
adcq 24(%rbx), %rcx
movq $0x7fffffffffffffff, %rdx
movq %rcx, %r11
sarq $63, %rcx
# Mask the modulus
andq %rcx, %rax
andq %rcx, %rdx
# Sub modulus (if overflow)
subq %rax, %r8
sbbq %rcx, %r9
sbbq %rcx, %r10
sbbq %rdx, %r11
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 24(%rsp), %rdi
leaq 48(%rsp), %rsi
movq 24(%rsp), %rbx
# Sub
movq (%rsi), %r8
movq 8(%rsi), %r9
movq 16(%rsi), %r10
movq 24(%rsi), %r11
subq (%rbx), %r8
movq $0x00, %rcx
sbbq 8(%rbx), %r9
movq $-19, %rax
sbbq 16(%rbx), %r10
movq $0x7fffffffffffffff, %rdx
sbbq 24(%rbx), %r11
sbbq $0x00, %rcx
# Mask the modulus
andq %rcx, %rax
andq %rcx, %rdx
# Add modulus (if underflow)
addq %rax, %r8
adcq %rcx, %r9
adcq %rcx, %r10
adcq %rdx, %r11
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
addq $0x50, %rsp
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbx
repz retq
#ifndef __APPLE__
.size fe_ge_madd_x64,.-fe_ge_madd_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_ge_msub_x64
.type fe_ge_msub_x64,@function
.align 4
fe_ge_msub_x64:
#else
.section __TEXT,__text
.globl _fe_ge_msub_x64
.p2align 2
_fe_ge_msub_x64:
#endif /* __APPLE__ */
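# Same structure as fe_ge_madd_x64, specialised for the mixed
# point-subtraction (ge msub) step.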
pushq %rbx
pushq %r12
pushq %r13
pushq %r14
pushq %r15
subq $0x50, %rsp
movq %rdi, (%rsp)
movq %rsi, 8(%rsp)
movq %rdx, 16(%rsp)
movq %rcx, 24(%rsp)
movq %r8, 32(%rsp)
movq %r9, 40(%rsp)
movq (%rsp), %rdi
movq 40(%rsp), %rsi
movq 32(%rsp), %rbx
# Add
movq (%rsi), %r8
movq 8(%rsi), %r9
addq (%rbx), %r8
movq 16(%rsi), %r10
adcq 8(%rbx), %r9
movq 24(%rsi), %rcx
adcq 16(%rbx), %r10
movq $-19, %rax
adcq 24(%rbx), %rcx
movq $0x7fffffffffffffff, %rdx
movq %rcx, %r11
sarq $63, %rcx
# Mask the modulus
andq %rcx, %rax
andq %rcx, %rdx
# Sub modulus (if overflow)
subq %rax, %r8
sbbq %rcx, %r9
sbbq %rcx, %r10
sbbq %rdx, %r11
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 8(%rsp), %rdi
movq 40(%rsp), %rsi
movq 32(%rsp), %rbx
# Sub
movq (%rsi), %r8
movq 8(%rsi), %r9
movq 16(%rsi), %r10
movq 24(%rsi), %r11
subq (%rbx), %r8
movq $0x00, %rcx
sbbq 8(%rbx), %r9
movq $-19, %rax
sbbq 16(%rbx), %r10
movq $0x7fffffffffffffff, %rdx
sbbq 24(%rbx), %r11
sbbq $0x00, %rcx
# Mask the modulus
andq %rcx, %rax
andq %rcx, %rdx
# Add modulus (if underflow)
addq %rax, %r8
adcq %rcx, %r9
adcq %rcx, %r10
adcq %rdx, %r11
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 16(%rsp), %rdi
movq (%rsp), %rsi
movq 160(%rsp), %rbx
# Multiply
# A[0] * B[0]
movq (%rbx), %rax
mulq (%rsi)
movq %rax, %r8
movq %rdx, %r9
# A[0] * B[1]
movq 8(%rbx), %rax
mulq (%rsi)
xorq %r10, %r10
addq %rax, %r9
adcq %rdx, %r10
# A[1] * B[0]
movq (%rbx), %rax
mulq 8(%rsi)
xorq %r11, %r11
addq %rax, %r9
adcq %rdx, %r10
adcq $0x00, %r11
# A[0] * B[2]
movq 16(%rbx), %rax
mulq (%rsi)
addq %rax, %r10
adcq %rdx, %r11
# A[1] * B[1]
movq 8(%rbx), %rax
mulq 8(%rsi)
xorq %r12, %r12
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[2] * B[0]
movq (%rbx), %rax
mulq 16(%rsi)
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[0] * B[3]
movq 24(%rbx), %rax
mulq (%rsi)
xorq %r13, %r13
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[2]
movq 16(%rbx), %rax
mulq 8(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[2] * B[1]
movq 8(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[3] * B[0]
movq (%rbx), %rax
mulq 24(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[3]
movq 24(%rbx), %rax
mulq 8(%rsi)
xorq %r14, %r14
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[2]
movq 16(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[3] * B[1]
movq 8(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[3]
movq 24(%rbx), %rax
mulq 16(%rsi)
xorq %r15, %r15
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[2]
movq 16(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[3]
movq 24(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r14
adcq %rdx, %r15
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rcx, %r11
# Multiply top half by 19
movq $19, %rax
mulq %r12
xorq %r12, %r12
addq %rax, %r8
movq $19, %rax
adcq %rdx, %r12
mulq %r13
xorq %r13, %r13
addq %rax, %r9
movq $19, %rax
adcq %rdx, %r13
mulq %r14
xorq %r14, %r14
addq %rax, %r10
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add in the remaining product results
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
adcq %rax, %r11
adcq $0x00, %rdx
# Overflow
shldq $0x01, %r11, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 8(%rsp), %rdi
movq 8(%rsp), %rsi
movq 152(%rsp), %rbx
# Multiply
# A[0] * B[0]
movq (%rbx), %rax
mulq (%rsi)
movq %rax, %r8
movq %rdx, %r9
# A[0] * B[1]
movq 8(%rbx), %rax
mulq (%rsi)
xorq %r10, %r10
addq %rax, %r9
adcq %rdx, %r10
# A[1] * B[0]
movq (%rbx), %rax
mulq 8(%rsi)
xorq %r11, %r11
addq %rax, %r9
adcq %rdx, %r10
adcq $0x00, %r11
# A[0] * B[2]
movq 16(%rbx), %rax
mulq (%rsi)
addq %rax, %r10
adcq %rdx, %r11
# A[1] * B[1]
movq 8(%rbx), %rax
mulq 8(%rsi)
xorq %r12, %r12
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[2] * B[0]
movq (%rbx), %rax
mulq 16(%rsi)
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[0] * B[3]
movq 24(%rbx), %rax
mulq (%rsi)
xorq %r13, %r13
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[2]
movq 16(%rbx), %rax
mulq 8(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[2] * B[1]
movq 8(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[3] * B[0]
movq (%rbx), %rax
mulq 24(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[3]
movq 24(%rbx), %rax
mulq 8(%rsi)
xorq %r14, %r14
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[2]
movq 16(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[3] * B[1]
movq 8(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[3]
movq 24(%rbx), %rax
mulq 16(%rsi)
xorq %r15, %r15
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[2]
movq 16(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[3]
movq 24(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r14
adcq %rdx, %r15
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rcx, %r11
# Multiply top half by 19
movq $19, %rax
mulq %r12
xorq %r12, %r12
addq %rax, %r8
movq $19, %rax
adcq %rdx, %r12
mulq %r13
xorq %r13, %r13
addq %rax, %r9
movq $19, %rax
adcq %rdx, %r13
mulq %r14
xorq %r14, %r14
addq %rax, %r10
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add in the remaining product results
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
adcq %rax, %r11
adcq $0x00, %rdx
# Overflow
shldq $0x01, %r11, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 24(%rsp), %rdi
movq 144(%rsp), %rsi
movq 136(%rsp), %rbx
# Multiply
# A[0] * B[0]
movq (%rbx), %rax
mulq (%rsi)
movq %rax, %r8
movq %rdx, %r9
# A[0] * B[1]
movq 8(%rbx), %rax
mulq (%rsi)
xorq %r10, %r10
addq %rax, %r9
adcq %rdx, %r10
# A[1] * B[0]
movq (%rbx), %rax
mulq 8(%rsi)
xorq %r11, %r11
addq %rax, %r9
adcq %rdx, %r10
adcq $0x00, %r11
# A[0] * B[2]
movq 16(%rbx), %rax
mulq (%rsi)
addq %rax, %r10
adcq %rdx, %r11
# A[1] * B[1]
movq 8(%rbx), %rax
mulq 8(%rsi)
xorq %r12, %r12
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[2] * B[0]
movq (%rbx), %rax
mulq 16(%rsi)
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[0] * B[3]
movq 24(%rbx), %rax
mulq (%rsi)
xorq %r13, %r13
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[2]
movq 16(%rbx), %rax
mulq 8(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[2] * B[1]
movq 8(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[3] * B[0]
movq (%rbx), %rax
mulq 24(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[3]
movq 24(%rbx), %rax
mulq 8(%rsi)
xorq %r14, %r14
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[2]
movq 16(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[3] * B[1]
movq 8(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[3]
movq 24(%rbx), %rax
mulq 16(%rsi)
xorq %r15, %r15
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[2]
movq 16(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[3]
movq 24(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r14
adcq %rdx, %r15
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rcx, %r11
# Multiply top half by 19
movq $19, %rax
mulq %r12
xorq %r12, %r12
addq %rax, %r8
movq $19, %rax
adcq %rdx, %r12
mulq %r13
xorq %r13, %r13
addq %rax, %r9
movq $19, %rax
adcq %rdx, %r13
mulq %r14
xorq %r14, %r14
addq %rax, %r10
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add in the remaining product results
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
adcq %rax, %r11
adcq $0x00, %rdx
# Overflow
shldq $0x01, %r11, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
leaq 48(%rsp), %rdi
movq 128(%rsp), %rsi
movq 128(%rsp), %rbx
# Add
movq (%rsi), %r8
movq 8(%rsi), %r9
addq (%rbx), %r8
movq 16(%rsi), %r10
adcq 8(%rbx), %r9
movq 24(%rsi), %rcx
adcq 16(%rbx), %r10
movq $-19, %rax
adcq 24(%rbx), %rcx
movq $0x7fffffffffffffff, %rdx
movq %rcx, %r11
sarq $63, %rcx
# Mask the modulus
andq %rcx, %rax
andq %rcx, %rdx
# Sub modulus (if overflow)
subq %rax, %r8
sbbq %rcx, %r9
sbbq %rcx, %r10
sbbq %rdx, %r11
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq (%rsp), %rdi
movq 16(%rsp), %rsi
movq 8(%rsp), %rbx
# Sub
movq (%rsi), %r8
movq 8(%rsi), %r9
movq 16(%rsi), %r10
movq 24(%rsi), %r11
subq (%rbx), %r8
movq $0x00, %rcx
sbbq 8(%rbx), %r9
movq $-19, %rax
sbbq 16(%rbx), %r10
movq $0x7fffffffffffffff, %rdx
sbbq 24(%rbx), %r11
sbbq $0x00, %rcx
# Mask the modulus
andq %rcx, %rax
andq %rcx, %rdx
# Add modulus (if underflow)
addq %rax, %r8
adcq %rcx, %r9
adcq %rcx, %r10
adcq %rdx, %r11
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 8(%rsp), %rdi
movq 16(%rsp), %rsi
movq 8(%rsp), %rbx
# Add
movq (%rsi), %r8
movq 8(%rsi), %r9
addq (%rbx), %r8
movq 16(%rsi), %r10
adcq 8(%rbx), %r9
movq 24(%rsi), %rcx
adcq 16(%rbx), %r10
movq $-19, %rax
adcq 24(%rbx), %rcx
movq $0x7fffffffffffffff, %rdx
movq %rcx, %r11
sarq $63, %rcx
# Mask the modulus
andq %rcx, %rax
andq %rcx, %rdx
# Sub modulus (if overflow)
subq %rax, %r8
sbbq %rcx, %r9
sbbq %rcx, %r10
sbbq %rdx, %r11
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 16(%rsp), %rdi
leaq 48(%rsp), %rsi
movq 24(%rsp), %rbx
# Sub
movq (%rsi), %r8
movq 8(%rsi), %r9
movq 16(%rsi), %r10
movq 24(%rsi), %r11
subq (%rbx), %r8
movq $0x00, %rcx
sbbq 8(%rbx), %r9
movq $-19, %rax
sbbq 16(%rbx), %r10
movq $0x7fffffffffffffff, %rdx
sbbq 24(%rbx), %r11
sbbq $0x00, %rcx
# Mask the modulus
andq %rcx, %rax
andq %rcx, %rdx
# Add modulus (if underflow)
addq %rax, %r8
adcq %rcx, %r9
adcq %rcx, %r10
adcq %rdx, %r11
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 24(%rsp), %rdi
leaq 48(%rsp), %rsi
movq 24(%rsp), %rbx
# Add
movq (%rsi), %r8
movq 8(%rsi), %r9
addq (%rbx), %r8
movq 16(%rsi), %r10
adcq 8(%rbx), %r9
movq 24(%rsi), %rcx
adcq 16(%rbx), %r10
movq $-19, %rax
adcq 24(%rbx), %rcx
movq $0x7fffffffffffffff, %rdx
movq %rcx, %r11
sarq $63, %rcx
# Mask the modulus
andq %rcx, %rax
andq %rcx, %rdx
# Sub modulus (if overflow)
subq %rax, %r8
sbbq %rcx, %r9
sbbq %rcx, %r10
sbbq %rdx, %r11
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
addq $0x50, %rsp
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbx
repz retq
#ifndef __APPLE__
.size fe_ge_msub_x64,.-fe_ge_msub_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_ge_add_x64
.type fe_ge_add_x64,@function
.align 4
fe_ge_add_x64:
#else
.section __TEXT,__text
.globl _fe_ge_add_x64
.p2align 2
_fe_ge_add_x64:
#endif /* __APPLE__ */
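# Field-arithmetic sequence for the full point-addition (ge add) step;
# it takes more operands, so stack arguments reach up to 168(%rsp).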
pushq %rbx
pushq %r12
pushq %r13
pushq %r14
pushq %r15
subq $0x50, %rsp
movq %rdi, (%rsp)
movq %rsi, 8(%rsp)
movq %rdx, 16(%rsp)
movq %rcx, 24(%rsp)
movq %r8, 32(%rsp)
movq %r9, 40(%rsp)
movq (%rsp), %rdi
movq 40(%rsp), %rsi
movq 32(%rsp), %rbx
# Add
movq (%rsi), %r8
movq 8(%rsi), %r9
addq (%rbx), %r8
movq 16(%rsi), %r10
adcq 8(%rbx), %r9
movq 24(%rsi), %rcx
adcq 16(%rbx), %r10
movq $-19, %rax
adcq 24(%rbx), %rcx
movq $0x7fffffffffffffff, %rdx
movq %rcx, %r11
sarq $63, %rcx
# Mask the modulus
andq %rcx, %rax
andq %rcx, %rdx
# Sub modulus (if overflow)
subq %rax, %r8
sbbq %rcx, %r9
sbbq %rcx, %r10
sbbq %rdx, %r11
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 8(%rsp), %rdi
movq 40(%rsp), %rsi
movq 32(%rsp), %rbx
# Sub
movq (%rsi), %r8
movq 8(%rsi), %r9
movq 16(%rsi), %r10
movq 24(%rsi), %r11
subq (%rbx), %r8
movq $0x00, %rcx
sbbq 8(%rbx), %r9
movq $-19, %rax
sbbq 16(%rbx), %r10
movq $0x7fffffffffffffff, %rdx
sbbq 24(%rbx), %r11
sbbq $0x00, %rcx
# Mask the modulus
andq %rcx, %rax
andq %rcx, %rdx
# Add modulus (if underflow)
addq %rax, %r8
adcq %rcx, %r9
adcq %rcx, %r10
adcq %rdx, %r11
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 16(%rsp), %rdi
movq (%rsp), %rsi
movq 160(%rsp), %rbx
# Multiply
# A[0] * B[0]
movq (%rbx), %rax
mulq (%rsi)
movq %rax, %r8
movq %rdx, %r9
# A[0] * B[1]
movq 8(%rbx), %rax
mulq (%rsi)
xorq %r10, %r10
addq %rax, %r9
adcq %rdx, %r10
# A[1] * B[0]
movq (%rbx), %rax
mulq 8(%rsi)
xorq %r11, %r11
addq %rax, %r9
adcq %rdx, %r10
adcq $0x00, %r11
# A[0] * B[2]
movq 16(%rbx), %rax
mulq (%rsi)
addq %rax, %r10
adcq %rdx, %r11
# A[1] * B[1]
movq 8(%rbx), %rax
mulq 8(%rsi)
xorq %r12, %r12
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[2] * B[0]
movq (%rbx), %rax
mulq 16(%rsi)
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[0] * B[3]
movq 24(%rbx), %rax
mulq (%rsi)
xorq %r13, %r13
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[2]
movq 16(%rbx), %rax
mulq 8(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[2] * B[1]
movq 8(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[3] * B[0]
movq (%rbx), %rax
mulq 24(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[3]
movq 24(%rbx), %rax
mulq 8(%rsi)
xorq %r14, %r14
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[2]
movq 16(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[3] * B[1]
movq 8(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[3]
movq 24(%rbx), %rax
mulq 16(%rsi)
xorq %r15, %r15
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[2]
movq 16(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[3]
movq 24(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r14
adcq %rdx, %r15
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rcx, %r11
# Multiply top half by 19
movq $19, %rax
mulq %r12
xorq %r12, %r12
addq %rax, %r8
movq $19, %rax
adcq %rdx, %r12
mulq %r13
xorq %r13, %r13
addq %rax, %r9
movq $19, %rax
adcq %rdx, %r13
mulq %r14
xorq %r14, %r14
addq %rax, %r10
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add in the remaining product results
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
adcq %rax, %r11
adcq $0x00, %rdx
# Overflow
shldq $0x01, %r11, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 8(%rsp), %rdi
movq 8(%rsp), %rsi
movq 168(%rsp), %rbx
# Multiply
# A[0] * B[0]
movq (%rbx), %rax
mulq (%rsi)
movq %rax, %r8
movq %rdx, %r9
# A[0] * B[1]
movq 8(%rbx), %rax
mulq (%rsi)
xorq %r10, %r10
addq %rax, %r9
adcq %rdx, %r10
# A[1] * B[0]
movq (%rbx), %rax
mulq 8(%rsi)
xorq %r11, %r11
addq %rax, %r9
adcq %rdx, %r10
adcq $0x00, %r11
# A[0] * B[2]
movq 16(%rbx), %rax
mulq (%rsi)
addq %rax, %r10
adcq %rdx, %r11
# A[1] * B[1]
movq 8(%rbx), %rax
mulq 8(%rsi)
xorq %r12, %r12
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[2] * B[0]
movq (%rbx), %rax
mulq 16(%rsi)
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[0] * B[3]
movq 24(%rbx), %rax
mulq (%rsi)
xorq %r13, %r13
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[2]
movq 16(%rbx), %rax
mulq 8(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[2] * B[1]
movq 8(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[3] * B[0]
movq (%rbx), %rax
mulq 24(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[3]
movq 24(%rbx), %rax
mulq 8(%rsi)
xorq %r14, %r14
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[2]
movq 16(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[3] * B[1]
movq 8(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[3]
movq 24(%rbx), %rax
mulq 16(%rsi)
xorq %r15, %r15
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[2]
movq 16(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[3]
movq 24(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r14
adcq %rdx, %r15
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rcx, %r11
# Multiply top half by 19
movq $19, %rax
mulq %r12
xorq %r12, %r12
addq %rax, %r8
movq $19, %rax
adcq %rdx, %r12
mulq %r13
xorq %r13, %r13
addq %rax, %r9
movq $19, %rax
adcq %rdx, %r13
mulq %r14
xorq %r14, %r14
addq %rax, %r10
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add in the remaining product results
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
adcq %rax, %r11
adcq $0x00, %rdx
# Overflow
shldq $0x01, %r11, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 24(%rsp), %rdi
movq 152(%rsp), %rsi
movq 136(%rsp), %rbx
# Multiply
# A[0] * B[0]
movq (%rbx), %rax
mulq (%rsi)
movq %rax, %r8
movq %rdx, %r9
# A[0] * B[1]
movq 8(%rbx), %rax
mulq (%rsi)
xorq %r10, %r10
addq %rax, %r9
adcq %rdx, %r10
# A[1] * B[0]
movq (%rbx), %rax
mulq 8(%rsi)
xorq %r11, %r11
addq %rax, %r9
adcq %rdx, %r10
adcq $0x00, %r11
# A[0] * B[2]
movq 16(%rbx), %rax
mulq (%rsi)
addq %rax, %r10
adcq %rdx, %r11
# A[1] * B[1]
movq 8(%rbx), %rax
mulq 8(%rsi)
xorq %r12, %r12
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[2] * B[0]
movq (%rbx), %rax
mulq 16(%rsi)
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[0] * B[3]
movq 24(%rbx), %rax
mulq (%rsi)
xorq %r13, %r13
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[2]
movq 16(%rbx), %rax
mulq 8(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[2] * B[1]
movq 8(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[3] * B[0]
movq (%rbx), %rax
mulq 24(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[3]
movq 24(%rbx), %rax
mulq 8(%rsi)
xorq %r14, %r14
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[2]
movq 16(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[3] * B[1]
movq 8(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[3]
movq 24(%rbx), %rax
mulq 16(%rsi)
xorq %r15, %r15
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[2]
movq 16(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[3]
movq 24(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r14
adcq %rdx, %r15
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rcx, %r11
# Multiply top half by 19
movq $19, %rax
mulq %r12
xorq %r12, %r12
addq %rax, %r8
movq $19, %rax
adcq %rdx, %r12
mulq %r13
xorq %r13, %r13
addq %rax, %r9
movq $19, %rax
adcq %rdx, %r13
mulq %r14
xorq %r14, %r14
addq %rax, %r10
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add in the remaining product results
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
adcq %rax, %r11
adcq $0x00, %rdx
# Overflow
shldq $0x01, %r11, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq (%rsp), %rdi
movq 128(%rsp), %rsi
movq 144(%rsp), %rbx
# Multiply
# A[0] * B[0]
movq (%rbx), %rax
mulq (%rsi)
movq %rax, %r8
movq %rdx, %r9
# A[0] * B[1]
movq 8(%rbx), %rax
mulq (%rsi)
xorq %r10, %r10
addq %rax, %r9
adcq %rdx, %r10
# A[1] * B[0]
movq (%rbx), %rax
mulq 8(%rsi)
xorq %r11, %r11
addq %rax, %r9
adcq %rdx, %r10
adcq $0x00, %r11
# A[0] * B[2]
movq 16(%rbx), %rax
mulq (%rsi)
addq %rax, %r10
adcq %rdx, %r11
# A[1] * B[1]
movq 8(%rbx), %rax
mulq 8(%rsi)
xorq %r12, %r12
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[2] * B[0]
movq (%rbx), %rax
mulq 16(%rsi)
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[0] * B[3]
movq 24(%rbx), %rax
mulq (%rsi)
xorq %r13, %r13
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[2]
movq 16(%rbx), %rax
mulq 8(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[2] * B[1]
movq 8(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[3] * B[0]
movq (%rbx), %rax
mulq 24(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[3]
movq 24(%rbx), %rax
mulq 8(%rsi)
xorq %r14, %r14
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[2]
movq 16(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[3] * B[1]
movq 8(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[3]
movq 24(%rbx), %rax
mulq 16(%rsi)
xorq %r15, %r15
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[2]
movq 16(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[3]
movq 24(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r14
adcq %rdx, %r15
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rcx, %r11
# Multiply top half by 19
movq $19, %rax
mulq %r12
xorq %r12, %r12
addq %rax, %r8
movq $19, %rax
adcq %rdx, %r12
mulq %r13
xorq %r13, %r13
addq %rax, %r9
movq $19, %rax
adcq %rdx, %r13
mulq %r14
xorq %r14, %r14
addq %rax, %r10
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add in the remaining product results
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
adcq %rax, %r11
adcq $0x00, %rdx
# Overflow
shldq $0x01, %r11, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
leaq 48(%rsp), %rdi
movq (%rsp), %rsi
movq (%rsp), %rbx
# Add
movq (%rsi), %r8
movq 8(%rsi), %r9
addq (%rbx), %r8
movq 16(%rsi), %r10
adcq 8(%rbx), %r9
movq 24(%rsi), %rcx
adcq 16(%rbx), %r10
movq $-19, %rax
adcq 24(%rbx), %rcx
movq $0x7fffffffffffffff, %rdx
movq %rcx, %r11
sarq $63, %rcx
# Mask the modulus
andq %rcx, %rax
andq %rcx, %rdx
# Sub modulus (if overflow)
subq %rax, %r8
sbbq %rcx, %r9
sbbq %rcx, %r10
sbbq %rdx, %r11
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq (%rsp), %rdi
movq 16(%rsp), %rsi
movq 8(%rsp), %rbx
# Sub
movq (%rsi), %r8
movq 8(%rsi), %r9
movq 16(%rsi), %r10
movq 24(%rsi), %r11
subq (%rbx), %r8
movq $0x00, %rcx
sbbq 8(%rbx), %r9
movq $-19, %rax
sbbq 16(%rbx), %r10
movq $0x7fffffffffffffff, %rdx
sbbq 24(%rbx), %r11
sbbq $0x00, %rcx
# Mask the modulus
andq %rcx, %rax
andq %rcx, %rdx
# Add modulus (if underflow)
addq %rax, %r8
adcq %rcx, %r9
adcq %rcx, %r10
adcq %rdx, %r11
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 8(%rsp), %rdi
movq 16(%rsp), %rsi
movq 8(%rsp), %rbx
# Add
movq (%rsi), %r8
movq 8(%rsi), %r9
addq (%rbx), %r8
movq 16(%rsi), %r10
adcq 8(%rbx), %r9
movq 24(%rsi), %rcx
adcq 16(%rbx), %r10
movq $-19, %rax
adcq 24(%rbx), %rcx
movq $0x7fffffffffffffff, %rdx
movq %rcx, %r11
sarq $63, %rcx
# Mask the modulus
andq %rcx, %rax
andq %rcx, %rdx
# Sub modulus (if overflow)
subq %rax, %r8
sbbq %rcx, %r9
sbbq %rcx, %r10
sbbq %rdx, %r11
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 16(%rsp), %rdi
leaq 48(%rsp), %rsi
movq 24(%rsp), %rbx
# Add
movq (%rsi), %r8
movq 8(%rsi), %r9
addq (%rbx), %r8
movq 16(%rsi), %r10
adcq 8(%rbx), %r9
movq 24(%rsi), %rcx
adcq 16(%rbx), %r10
movq $-19, %rax
adcq 24(%rbx), %rcx
movq $0x7fffffffffffffff, %rdx
movq %rcx, %r11
sarq $63, %rcx
# Mask the modulus
andq %rcx, %rax
andq %rcx, %rdx
# Sub modulus (if overflow)
subq %rax, %r8
sbbq %rcx, %r9
sbbq %rcx, %r10
sbbq %rdx, %r11
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 24(%rsp), %rdi
leaq 48(%rsp), %rsi
movq 24(%rsp), %rbx
# Sub
movq (%rsi), %r8
movq 8(%rsi), %r9
movq 16(%rsi), %r10
movq 24(%rsi), %r11
subq (%rbx), %r8
movq $0x00, %rcx
sbbq 8(%rbx), %r9
movq $-19, %rax
sbbq 16(%rbx), %r10
movq $0x7fffffffffffffff, %rdx
sbbq 24(%rbx), %r11
sbbq $0x00, %rcx
# Mask the modulus
andq %rcx, %rax
andq %rcx, %rdx
# Add modulus (if underflow)
addq %rax, %r8
adcq %rcx, %r9
adcq %rcx, %r10
adcq %rdx, %r11
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
addq $0x50, %rsp
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbx
repz retq
#ifndef __APPLE__
.size fe_ge_add_x64,.-fe_ge_add_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_ge_sub_x64
.type fe_ge_sub_x64,@function
.align 4
fe_ge_sub_x64:
#else
.section __TEXT,__text
.globl _fe_ge_sub_x64
.p2align 2
_fe_ge_sub_x64:
#endif /* __APPLE__ */
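# Field-arithmetic sequence for the full point-subtraction (ge sub) step;
# same frame and stack-argument layout as fe_ge_add_x64.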
pushq %rbx
pushq %r12
pushq %r13
pushq %r14
pushq %r15
subq $0x50, %rsp
movq %rdi, (%rsp)
movq %rsi, 8(%rsp)
movq %rdx, 16(%rsp)
movq %rcx, 24(%rsp)
movq %r8, 32(%rsp)
movq %r9, 40(%rsp)
movq (%rsp), %rdi
movq 40(%rsp), %rsi
movq 32(%rsp), %rbx
# Add
movq (%rsi), %r8
movq 8(%rsi), %r9
addq (%rbx), %r8
movq 16(%rsi), %r10
adcq 8(%rbx), %r9
movq 24(%rsi), %rcx
adcq 16(%rbx), %r10
movq $-19, %rax
adcq 24(%rbx), %rcx
movq $0x7fffffffffffffff, %rdx
movq %rcx, %r11
sarq $63, %rcx
# Mask the modulus
andq %rcx, %rax
andq %rcx, %rdx
# Sub modulus (if overflow)
subq %rax, %r8
sbbq %rcx, %r9
sbbq %rcx, %r10
sbbq %rdx, %r11
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 8(%rsp), %rdi
movq 40(%rsp), %rsi
movq 32(%rsp), %rbx
# Sub
movq (%rsi), %r8
movq 8(%rsi), %r9
movq 16(%rsi), %r10
movq 24(%rsi), %r11
subq (%rbx), %r8
movq $0x00, %rcx
sbbq 8(%rbx), %r9
movq $-19, %rax
sbbq 16(%rbx), %r10
movq $0x7fffffffffffffff, %rdx
sbbq 24(%rbx), %r11
sbbq $0x00, %rcx
# Mask the modulus
andq %rcx, %rax
andq %rcx, %rdx
# Add modulus (if underflow)
addq %rax, %r8
adcq %rcx, %r9
adcq %rcx, %r10
adcq %rdx, %r11
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 16(%rsp), %rdi
movq (%rsp), %rsi
movq 168(%rsp), %rbx
# Multiply
# A[0] * B[0]
movq (%rbx), %rax
mulq (%rsi)
movq %rax, %r8
movq %rdx, %r9
# A[0] * B[1]
movq 8(%rbx), %rax
mulq (%rsi)
xorq %r10, %r10
addq %rax, %r9
adcq %rdx, %r10
# A[1] * B[0]
movq (%rbx), %rax
mulq 8(%rsi)
xorq %r11, %r11
addq %rax, %r9
adcq %rdx, %r10
adcq $0x00, %r11
# A[0] * B[2]
movq 16(%rbx), %rax
mulq (%rsi)
addq %rax, %r10
adcq %rdx, %r11
# A[1] * B[1]
movq 8(%rbx), %rax
mulq 8(%rsi)
xorq %r12, %r12
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[2] * B[0]
movq (%rbx), %rax
mulq 16(%rsi)
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[0] * B[3]
movq 24(%rbx), %rax
mulq (%rsi)
xorq %r13, %r13
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[2]
movq 16(%rbx), %rax
mulq 8(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[2] * B[1]
movq 8(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[3] * B[0]
movq (%rbx), %rax
mulq 24(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[3]
movq 24(%rbx), %rax
mulq 8(%rsi)
xorq %r14, %r14
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[2]
movq 16(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[3] * B[1]
movq 8(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[3]
movq 24(%rbx), %rax
mulq 16(%rsi)
xorq %r15, %r15
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[2]
movq 16(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[3]
movq 24(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r14
adcq %rdx, %r15
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rcx, %r11
# Multiply top half by 19
movq $19, %rax
mulq %r12
xorq %r12, %r12
addq %rax, %r8
movq $19, %rax
adcq %rdx, %r12
mulq %r13
xorq %r13, %r13
addq %rax, %r9
movq $19, %rax
adcq %rdx, %r13
mulq %r14
xorq %r14, %r14
addq %rax, %r10
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add in the remaining product results
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
adcq %rax, %r11
adcq $0x00, %rdx
# Overflow
shldq $0x01, %r11, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 8(%rsp), %rdi
movq 8(%rsp), %rsi
movq 160(%rsp), %rbx
# Multiply
# A[0] * B[0]
movq (%rbx), %rax
mulq (%rsi)
movq %rax, %r8
movq %rdx, %r9
# A[0] * B[1]
movq 8(%rbx), %rax
mulq (%rsi)
xorq %r10, %r10
addq %rax, %r9
adcq %rdx, %r10
# A[1] * B[0]
movq (%rbx), %rax
mulq 8(%rsi)
xorq %r11, %r11
addq %rax, %r9
adcq %rdx, %r10
adcq $0x00, %r11
# A[0] * B[2]
movq 16(%rbx), %rax
mulq (%rsi)
addq %rax, %r10
adcq %rdx, %r11
# A[1] * B[1]
movq 8(%rbx), %rax
mulq 8(%rsi)
xorq %r12, %r12
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[2] * B[0]
movq (%rbx), %rax
mulq 16(%rsi)
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[0] * B[3]
movq 24(%rbx), %rax
mulq (%rsi)
xorq %r13, %r13
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[2]
movq 16(%rbx), %rax
mulq 8(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[2] * B[1]
movq 8(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[3] * B[0]
movq (%rbx), %rax
mulq 24(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[3]
movq 24(%rbx), %rax
mulq 8(%rsi)
xorq %r14, %r14
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[2]
movq 16(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[3] * B[1]
movq 8(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[3]
movq 24(%rbx), %rax
mulq 16(%rsi)
xorq %r15, %r15
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[2]
movq 16(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[3]
movq 24(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r14
adcq %rdx, %r15
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rcx, %r11
# Multiply top half by 19
movq $19, %rax
mulq %r12
xorq %r12, %r12
addq %rax, %r8
movq $19, %rax
adcq %rdx, %r12
mulq %r13
xorq %r13, %r13
addq %rax, %r9
movq $19, %rax
adcq %rdx, %r13
mulq %r14
xorq %r14, %r14
addq %rax, %r10
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add in the remaining product results
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
adcq %rax, %r11
adcq $0x00, %rdx
# Overflow
shldq $0x01, %r11, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 24(%rsp), %rdi
movq 152(%rsp), %rsi
movq 136(%rsp), %rbx
# Multiply
# A[0] * B[0]
movq (%rbx), %rax
mulq (%rsi)
movq %rax, %r8
movq %rdx, %r9
# A[0] * B[1]
movq 8(%rbx), %rax
mulq (%rsi)
xorq %r10, %r10
addq %rax, %r9
adcq %rdx, %r10
# A[1] * B[0]
movq (%rbx), %rax
mulq 8(%rsi)
xorq %r11, %r11
addq %rax, %r9
adcq %rdx, %r10
adcq $0x00, %r11
# A[0] * B[2]
movq 16(%rbx), %rax
mulq (%rsi)
addq %rax, %r10
adcq %rdx, %r11
# A[1] * B[1]
movq 8(%rbx), %rax
mulq 8(%rsi)
xorq %r12, %r12
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[2] * B[0]
movq (%rbx), %rax
mulq 16(%rsi)
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[0] * B[3]
movq 24(%rbx), %rax
mulq (%rsi)
xorq %r13, %r13
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[2]
movq 16(%rbx), %rax
mulq 8(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[2] * B[1]
movq 8(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[3] * B[0]
movq (%rbx), %rax
mulq 24(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[3]
movq 24(%rbx), %rax
mulq 8(%rsi)
xorq %r14, %r14
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[2]
movq 16(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[3] * B[1]
movq 8(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[3]
movq 24(%rbx), %rax
mulq 16(%rsi)
xorq %r15, %r15
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[2]
movq 16(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[3]
movq 24(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r14
adcq %rdx, %r15
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rcx, %r11
# Multiply top half by 19
movq $19, %rax
mulq %r12
xorq %r12, %r12
addq %rax, %r8
movq $19, %rax
adcq %rdx, %r12
mulq %r13
xorq %r13, %r13
addq %rax, %r9
movq $19, %rax
adcq %rdx, %r13
mulq %r14
xorq %r14, %r14
addq %rax, %r10
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add in the remaining product results
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
adcq %rax, %r11
adcq $0x00, %rdx
# Overflow
shldq $0x01, %r11, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq (%rsp), %rdi
movq 128(%rsp), %rsi
movq 144(%rsp), %rbx
# Multiply
# A[0] * B[0]
movq (%rbx), %rax
mulq (%rsi)
movq %rax, %r8
movq %rdx, %r9
# A[0] * B[1]
movq 8(%rbx), %rax
mulq (%rsi)
xorq %r10, %r10
addq %rax, %r9
adcq %rdx, %r10
# A[1] * B[0]
movq (%rbx), %rax
mulq 8(%rsi)
xorq %r11, %r11
addq %rax, %r9
adcq %rdx, %r10
adcq $0x00, %r11
# A[0] * B[2]
movq 16(%rbx), %rax
mulq (%rsi)
addq %rax, %r10
adcq %rdx, %r11
# A[1] * B[1]
movq 8(%rbx), %rax
mulq 8(%rsi)
xorq %r12, %r12
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[2] * B[0]
movq (%rbx), %rax
mulq 16(%rsi)
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[0] * B[3]
movq 24(%rbx), %rax
mulq (%rsi)
xorq %r13, %r13
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[2]
movq 16(%rbx), %rax
mulq 8(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[2] * B[1]
movq 8(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[3] * B[0]
movq (%rbx), %rax
mulq 24(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[3]
movq 24(%rbx), %rax
mulq 8(%rsi)
xorq %r14, %r14
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[2]
movq 16(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[3] * B[1]
movq 8(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[3]
movq 24(%rbx), %rax
mulq 16(%rsi)
xorq %r15, %r15
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[2]
movq 16(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[3]
movq 24(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r14
adcq %rdx, %r15
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rcx, %r11
# Multiply top half by 19
movq $19, %rax
mulq %r12
xorq %r12, %r12
addq %rax, %r8
movq $19, %rax
adcq %rdx, %r12
mulq %r13
xorq %r13, %r13
addq %rax, %r9
movq $19, %rax
adcq %rdx, %r13
mulq %r14
xorq %r14, %r14
addq %rax, %r10
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add in the remaining product results
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
adcq %rax, %r11
adcq $0x00, %rdx
# Overflow
shldq $0x01, %r11, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
leaq 48(%rsp), %rdi
movq (%rsp), %rsi
movq (%rsp), %rbx
# Add
movq (%rsi), %r8
movq 8(%rsi), %r9
addq (%rbx), %r8
movq 16(%rsi), %r10
adcq 8(%rbx), %r9
movq 24(%rsi), %rcx
adcq 16(%rbx), %r10
movq $-19, %rax
adcq 24(%rbx), %rcx
movq $0x7fffffffffffffff, %rdx
movq %rcx, %r11
sarq $63, %rcx
# Mask the modulus
andq %rcx, %rax
andq %rcx, %rdx
# Sub modulus (if overflow)
subq %rax, %r8
sbbq %rcx, %r9
sbbq %rcx, %r10
sbbq %rdx, %r11
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq (%rsp), %rdi
movq 16(%rsp), %rsi
movq 8(%rsp), %rbx
# Sub
movq (%rsi), %r8
movq 8(%rsi), %r9
movq 16(%rsi), %r10
movq 24(%rsi), %r11
subq (%rbx), %r8
movq $0x00, %rcx
sbbq 8(%rbx), %r9
movq $-19, %rax
sbbq 16(%rbx), %r10
movq $0x7fffffffffffffff, %rdx
sbbq 24(%rbx), %r11
sbbq $0x00, %rcx
# Mask the modulus
andq %rcx, %rax
andq %rcx, %rdx
# Add modulus (if underflow)
addq %rax, %r8
adcq %rcx, %r9
adcq %rcx, %r10
adcq %rdx, %r11
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 8(%rsp), %rdi
movq 16(%rsp), %rsi
movq 8(%rsp), %rbx
# Add
movq (%rsi), %r8
movq 8(%rsi), %r9
addq (%rbx), %r8
movq 16(%rsi), %r10
adcq 8(%rbx), %r9
movq 24(%rsi), %rcx
adcq 16(%rbx), %r10
movq $-19, %rax
adcq 24(%rbx), %rcx
movq $0x7fffffffffffffff, %rdx
movq %rcx, %r11
sarq $63, %rcx
# Mask the modulus
andq %rcx, %rax
andq %rcx, %rdx
# Sub modulus (if overflow)
subq %rax, %r8
sbbq %rcx, %r9
sbbq %rcx, %r10
sbbq %rdx, %r11
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 16(%rsp), %rdi
leaq 48(%rsp), %rsi
movq 24(%rsp), %rbx
# Sub
movq (%rsi), %r8
movq 8(%rsi), %r9
movq 16(%rsi), %r10
movq 24(%rsi), %r11
subq (%rbx), %r8
movq $0x00, %rcx
sbbq 8(%rbx), %r9
movq $-19, %rax
sbbq 16(%rbx), %r10
movq $0x7fffffffffffffff, %rdx
sbbq 24(%rbx), %r11
sbbq $0x00, %rcx
# Mask the modulus
andq %rcx, %rax
andq %rcx, %rdx
# Add modulus (if underflow)
addq %rax, %r8
adcq %rcx, %r9
adcq %rcx, %r10
adcq %rdx, %r11
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 24(%rsp), %rdi
leaq 48(%rsp), %rsi
movq 24(%rsp), %rbx
# Add
movq (%rsi), %r8
movq 8(%rsi), %r9
addq (%rbx), %r8
movq 16(%rsi), %r10
adcq 8(%rbx), %r9
movq 24(%rsi), %rcx
adcq 16(%rbx), %r10
movq $-19, %rax
adcq 24(%rbx), %rcx
movq $0x7fffffffffffffff, %rdx
movq %rcx, %r11
sarq $63, %rcx
# Mask the modulus
andq %rcx, %rax
andq %rcx, %rdx
# Sub modulus (if overflow)
subq %rax, %r8
sbbq %rcx, %r9
sbbq %rcx, %r10
sbbq %rdx, %r11
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
addq $0x50, %rsp
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbx
repz retq
#ifndef __APPLE__
.size fe_ge_sub_x64,.-fe_ge_sub_x64
#endif /* __APPLE__ */
#ifdef HAVE_INTEL_AVX2
#ifndef __APPLE__
.text
.globl fe_mul_avx2
.type fe_mul_avx2,@function
.align 4
fe_mul_avx2:
#else
.section __TEXT,__text
.globl _fe_mul_avx2
.p2align 2
_fe_mul_avx2:
#endif /* __APPLE__ */
pushq %r12
pushq %r13
pushq %r14
pushq %r15
pushq %rbx
movq %rdx, %rbx
# Multiply
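# BMI2/ADX variant: mulx forms each 128-bit partial product without
# touching flags, and two independent carry chains run through adcx (CF)
# and adox (OF) so the additions can interleave.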
# A[0] * B[0]
movq (%rbx), %rdx
mulxq (%rsi), %r8, %r9
# A[2] * B[0]
mulxq 16(%rsi), %r10, %r11
# A[1] * B[0]
mulxq 8(%rsi), %rax, %rcx
xorq %r15, %r15
adcxq %rax, %r9
# A[1] * B[3]
movq 24(%rbx), %rdx
mulxq 8(%rsi), %r12, %r13
adcxq %rcx, %r10
# A[0] * B[1]
movq 8(%rbx), %rdx
mulxq (%rsi), %rax, %rcx
adoxq %rax, %r9
# A[2] * B[1]
mulxq 16(%rsi), %rax, %r14
adoxq %rcx, %r10
adcxq %rax, %r11
# A[1] * B[2]
movq 16(%rbx), %rdx
mulxq 8(%rsi), %rax, %rcx
adcxq %r14, %r12
adoxq %rax, %r11
adcxq %r15, %r13
adoxq %rcx, %r12
# A[0] * B[2]
mulxq (%rsi), %rax, %rcx
adoxq %r15, %r13
xorq %r14, %r14
adcxq %rax, %r10
# A[1] * B[1]
movq 8(%rbx), %rdx
mulxq 8(%rsi), %rdx, %rax
adcxq %rcx, %r11
adoxq %rdx, %r10
# A[3] * B[1]
movq 8(%rbx), %rdx
adoxq %rax, %r11
mulxq 24(%rsi), %rax, %rcx
adcxq %rax, %r12
# A[2] * B[2]
movq 16(%rbx), %rdx
mulxq 16(%rsi), %rdx, %rax
adcxq %rcx, %r13
adoxq %rdx, %r12
# A[3] * B[3]
movq 24(%rbx), %rdx
adoxq %rax, %r13
mulxq 24(%rsi), %rax, %rcx
adoxq %r15, %r14
adcxq %rax, %r14
# A[0] * B[3]
mulxq (%rsi), %rdx, %rax
adcxq %rcx, %r15
xorq %rcx, %rcx
adcxq %rdx, %r11
# A[3] * B[0]
movq (%rbx), %rdx
adcxq %rax, %r12
mulxq 24(%rsi), %rdx, %rax
adoxq %rdx, %r11
adoxq %rax, %r12
# A[2] * B[3]
movq 24(%rbx), %rdx
mulxq 16(%rsi), %rdx, %rax
adcxq %rdx, %r13
# A[3] * B[2]
movq 16(%rbx), %rdx
adcxq %rax, %r14
mulxq 24(%rsi), %rax, %rdx
adcxq %rcx, %r15
adoxq %rax, %r13
adoxq %rdx, %r14
adoxq %rcx, %r15
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rcx, %r11
# Multiply top half by 19
movq $19, %rdx
xorq %rcx, %rcx
mulxq %r12, %rax, %r12
adcxq %rax, %r8
adoxq %r12, %r9
mulxq %r13, %rax, %r13
adcxq %rax, %r9
adoxq %r13, %r10
mulxq %r14, %rax, %r14
adcxq %rax, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rcx, %rdx
adcxq %rcx, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rcx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
popq %rbx
popq %r15
popq %r14
popq %r13
popq %r12
repz retq
#ifndef __APPLE__
.size fe_mul_avx2,.-fe_mul_avx2
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_sq_avx2
.type fe_sq_avx2,@function
.align 4
fe_sq_avx2:
#else
.section __TEXT,__text
.globl _fe_sq_avx2
.p2align 2
_fe_sq_avx2:
#endif /* __APPLE__ */
pushq %rbx
pushq %r12
pushq %r13
pushq %r14
pushq %r15
# Square
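# The off-diagonal products A[i]*A[j] (i < j) are formed once, doubled via
# the carry chain, then the diagonal squares A[i]*A[i] are added in.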
# A[0] * A[1]
movq (%rsi), %rdx
mulxq 8(%rsi), %r9, %r10
# A[0] * A[3]
mulxq 24(%rsi), %r11, %r12
# A[2] * A[1]
movq 16(%rsi), %rdx
mulxq 8(%rsi), %rcx, %rbx
xorq %r15, %r15
adoxq %rcx, %r11
# A[2] * A[3]
mulxq 24(%rsi), %r13, %r14
adoxq %rbx, %r12
# A[2] * A[0]
mulxq (%rsi), %rcx, %rbx
adoxq %r15, %r13
adcxq %rcx, %r10
adoxq %r15, %r14
# A[1] * A[3]
movq 8(%rsi), %rdx
mulxq 24(%rsi), %rax, %r8
adcxq %rbx, %r11
adcxq %rax, %r12
adcxq %r8, %r13
adcxq %r15, %r14
# Double with Carry Flag
xorq %r15, %r15
# A[0] * A[0]
movq (%rsi), %rdx
mulxq %rdx, %r8, %rax
adcxq %r9, %r9
# A[1] * A[1]
movq 8(%rsi), %rdx
mulxq %rdx, %rcx, %rbx
adcxq %r10, %r10
adoxq %rax, %r9
adcxq %r11, %r11
adoxq %rcx, %r10
# A[2] * A[2]
movq 16(%rsi), %rdx
mulxq %rdx, %rax, %rcx
adcxq %r12, %r12
adoxq %rbx, %r11
adcxq %r13, %r13
adoxq %rax, %r12
# A[3] * A[3]
movq 24(%rsi), %rdx
mulxq %rdx, %rax, %rbx
adcxq %r14, %r14
adoxq %rcx, %r13
adcxq %r15, %r15
adoxq %rax, %r14
adoxq %rbx, %r15
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rcx, %r11
# Multiply top half by 19
movq $19, %rdx
xorq %rcx, %rcx
mulxq %r12, %rax, %r12
adcxq %rax, %r8
adoxq %r12, %r9
mulxq %r13, %rax, %r13
adcxq %rax, %r9
adoxq %r13, %r10
mulxq %r14, %rax, %r14
adcxq %rax, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rcx, %rdx
adcxq %rcx, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rcx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbx
repz retq
#ifndef __APPLE__
.size fe_sq_avx2,.-fe_sq_avx2
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_sq_n_avx2
.type fe_sq_n_avx2,@function
.align 4
fe_sq_n_avx2:
#else
.section __TEXT,__text
.globl _fe_sq_n_avx2
.p2align 2
_fe_sq_n_avx2:
#endif /* __APPLE__ */
pushq %rbx
pushq %r12
pushq %r13
pushq %r14
pushq %r15
pushq %rbp
movq %rdx, %rbp
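# The square count (third argument) is kept in rbp because rdx is needed
# (and overwritten) by mulx; its low byte is decremented once per pass
# (decb %bpl at the bottom of the loop).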
L_fe_sq_n_avx2:
# Square
# A[0] * A[1]
movq (%rsi), %rdx
mulxq 8(%rsi), %r9, %r10
# A[0] * A[3]
mulxq 24(%rsi), %r11, %r12
# A[2] * A[1]
movq 16(%rsi), %rdx
mulxq 8(%rsi), %rcx, %rbx
xorq %r15, %r15
adoxq %rcx, %r11
# A[2] * A[3]
mulxq 24(%rsi), %r13, %r14
adoxq %rbx, %r12
# A[2] * A[0]
mulxq (%rsi), %rcx, %rbx
adoxq %r15, %r13
adcxq %rcx, %r10
adoxq %r15, %r14
# A[1] * A[3]
movq 8(%rsi), %rdx
mulxq 24(%rsi), %rax, %r8
adcxq %rbx, %r11
adcxq %rax, %r12
adcxq %r8, %r13
adcxq %r15, %r14
# Double with Carry Flag
xorq %r15, %r15
# A[0] * A[0]
movq (%rsi), %rdx
mulxq %rdx, %r8, %rax
adcxq %r9, %r9
# A[1] * A[1]
movq 8(%rsi), %rdx
mulxq %rdx, %rcx, %rbx
adcxq %r10, %r10
adoxq %rax, %r9
adcxq %r11, %r11
adoxq %rcx, %r10
# A[2] * A[2]
movq 16(%rsi), %rdx
mulxq %rdx, %rax, %rcx
adcxq %r12, %r12
adoxq %rbx, %r11
adcxq %r13, %r13
adoxq %rax, %r12
# A[3] * A[3]
movq 24(%rsi), %rdx
mulxq %rdx, %rax, %rbx
adcxq %r14, %r14
adoxq %rcx, %r13
adcxq %r15, %r15
adoxq %rax, %r14
adoxq %rbx, %r15
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rcx, %r11
# Multiply top half by 19
movq $19, %rdx
xorq %rcx, %rcx
mulxq %r12, %rax, %r12
adcxq %rax, %r8
adoxq %r12, %r9
mulxq %r13, %rax, %r13
adcxq %rax, %r9
adoxq %r13, %r10
mulxq %r14, %rax, %r14
adcxq %rax, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rcx, %rdx
adcxq %rcx, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rcx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
decb %bpl
jnz L_fe_sq_n_avx2
popq %rbp
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbx
repz retq
#ifndef __APPLE__
.size fe_sq_n_avx2,.-fe_sq_n_avx2
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_mul121666_avx2
.type fe_mul121666_avx2,@function
.align 4
fe_mul121666_avx2:
#else
.section __TEXT,__text
.globl _fe_mul121666_avx2
.p2align 2
_fe_mul121666_avx2:
#endif /* __APPLE__ */
pushq %r12
pushq %r13
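        # Multiply by the curve constant (486662 + 2) / 4 = 121666 = 0x1db42
        # used in the Montgomery ladder doubling step, then reduce mod 2^255 - 19.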
movq $0x1db42, %rdx
mulxq (%rsi), %rax, %r13
mulxq 8(%rsi), %rcx, %r12
mulxq 16(%rsi), %r8, %r11
mulxq 24(%rsi), %r9, %r10
addq %r13, %rcx
adcq %r12, %r8
adcq %r11, %r9
adcq $0x00, %r10
movq $0x7fffffffffffffff, %r13
shldq $0x01, %r9, %r10
andq %r13, %r9
imulq $19, %r10, %r10
addq %r10, %rax
adcq $0x00, %rcx
adcq $0x00, %r8
adcq $0x00, %r9
movq %rax, (%rdi)
movq %rcx, 8(%rdi)
movq %r8, 16(%rdi)
movq %r9, 24(%rdi)
popq %r13
popq %r12
repz retq
#ifndef __APPLE__
.size fe_mul121666_avx2,.-fe_mul121666_avx2
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_sq2_avx2
.type fe_sq2_avx2,@function
.align 4
fe_sq2_avx2:
#else
.section __TEXT,__text
.globl _fe_sq2_avx2
.p2align 2
_fe_sq2_avx2:
#endif /* __APPLE__ */
pushq %rbx
pushq %r12
pushq %r13
pushq %r14
pushq %r15
# Square * 2
# A[0] * A[1]
movq (%rsi), %rdx
mulxq 8(%rsi), %r9, %r10
# A[0] * A[3]
mulxq 24(%rsi), %r11, %r12
# A[2] * A[1]
movq 16(%rsi), %rdx
mulxq 8(%rsi), %rcx, %rbx
xorq %r15, %r15
adoxq %rcx, %r11
# A[2] * A[3]
mulxq 24(%rsi), %r13, %r14
adoxq %rbx, %r12
# A[2] * A[0]
mulxq (%rsi), %rcx, %rbx
adoxq %r15, %r13
adcxq %rcx, %r10
adoxq %r15, %r14
# A[1] * A[3]
movq 8(%rsi), %rdx
mulxq 24(%rsi), %rax, %r8
adcxq %rbx, %r11
adcxq %rax, %r12
adcxq %r8, %r13
adcxq %r15, %r14
# Double with Carry Flag
xorq %r15, %r15
# A[0] * A[0]
movq (%rsi), %rdx
mulxq %rdx, %r8, %rax
adcxq %r9, %r9
# A[1] * A[1]
movq 8(%rsi), %rdx
mulxq %rdx, %rcx, %rbx
adcxq %r10, %r10
adoxq %rax, %r9
adcxq %r11, %r11
adoxq %rcx, %r10
# A[2] * A[2]
movq 16(%rsi), %rdx
mulxq %rdx, %rax, %rcx
adcxq %r12, %r12
adoxq %rbx, %r11
adcxq %r13, %r13
adoxq %rax, %r12
# A[3] * A[3]
movq 24(%rsi), %rdx
mulxq %rdx, %rax, %rbx
adcxq %r14, %r14
adoxq %rcx, %r13
adcxq %r15, %r15
adoxq %rax, %r14
adoxq %rbx, %r15
# Reduce
movq $0x7fffffffffffffff, %rbx
xorq %rax, %rax
# Move top half into t4-t7 and remove top bit from t3 and double
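        # The doubling is folded into the reduction shifts: the low half is
        # shifted left by one bit and the high half by two (one for the
        # doubling, one to align it at 2^255).  The few bits that land at or
        # above 2^(2*255) wrap around twice, so they are multiplied by
        # 19 * 19 = 0x169 below instead of 19.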
shldq $3, %r15, %rax
shldq $2, %r14, %r15
shldq $2, %r13, %r14
shldq $2, %r12, %r13
shldq $2, %r11, %r12
shldq $0x01, %r10, %r11
shldq $0x01, %r9, %r10
shldq $0x01, %r8, %r9
shlq $0x01, %r8
andq %rbx, %r11
# Two out left, one in right
andq %rbx, %r15
# Multiply top bits by 19*19
imulq $0x169, %rax, %rcx
xorq %rbx, %rbx
# Multiply top half by 19
movq $19, %rdx
adoxq %rcx, %r8
mulxq %r12, %rax, %r12
adcxq %rax, %r8
adoxq %r12, %r9
mulxq %r13, %rax, %r13
adcxq %rax, %r9
adoxq %r13, %r10
mulxq %r14, %rax, %r14
adcxq %rax, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rbx, %rdx
adcxq %rbx, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rbx
imulq $19, %rdx, %rax
andq %rbx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rbx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbx
repz retq
#ifndef __APPLE__
.size fe_sq2_avx2,.-fe_sq2_avx2
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_invert_avx2
.type fe_invert_avx2,@function
.align 4
fe_invert_avx2:
#else
.section __TEXT,__text
.globl _fe_invert_avx2
.p2align 2
_fe_invert_avx2:
#endif /* __APPLE__ */
subq $0x90, %rsp
# Invert
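        # Inversion by Fermat's little theorem: computes z^(p-2) for
        # p = 2^255 - 19 with the standard Curve25519 addition chain
        # (254 squarings, 11 multiplications), using the four 32-byte
        # temporaries at 0/32/64/96(%rsp).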
movq %rdi, 128(%rsp)
movq %rsi, 136(%rsp)
movq %rsp, %rdi
movq 136(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
movq %rsp, %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 32(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
movq 136(%rsp), %rsi
leaq 32(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
movq %rsp, %rdi
movq %rsp, %rsi
leaq 32(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
movq %rsp, %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 32(%rsp), %rsi
leaq 64(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 32(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 64(%rsp), %rsi
movq $4, %rdx
#ifndef __APPLE__
callq fe_sq_n_avx2@plt
#else
callq _fe_sq_n_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 64(%rsp), %rsi
leaq 32(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 32(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 64(%rsp), %rsi
movq $9, %rdx
#ifndef __APPLE__
callq fe_sq_n_avx2@plt
#else
callq _fe_sq_n_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 64(%rsp), %rsi
leaq 32(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 64(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 96(%rsp), %rsi
movq $19, %rdx
#ifndef __APPLE__
callq fe_sq_n_avx2@plt
#else
callq _fe_sq_n_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 96(%rsp), %rsi
leaq 64(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 64(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 64(%rsp), %rsi
movq $9, %rdx
#ifndef __APPLE__
callq fe_sq_n_avx2@plt
#else
callq _fe_sq_n_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 64(%rsp), %rsi
leaq 32(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 32(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 64(%rsp), %rsi
movq $49, %rdx
#ifndef __APPLE__
callq fe_sq_n_avx2@plt
#else
callq _fe_sq_n_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 64(%rsp), %rsi
leaq 32(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 64(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 96(%rsp), %rsi
movq $0x63, %rdx
#ifndef __APPLE__
callq fe_sq_n_avx2@plt
#else
callq _fe_sq_n_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 96(%rsp), %rsi
leaq 64(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 64(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 64(%rsp), %rsi
movq $49, %rdx
#ifndef __APPLE__
callq fe_sq_n_avx2@plt
#else
callq _fe_sq_n_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 64(%rsp), %rsi
leaq 32(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 32(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 32(%rsp), %rsi
movq $4, %rdx
#ifndef __APPLE__
callq fe_sq_n_avx2@plt
#else
callq _fe_sq_n_avx2
#endif /* __APPLE__ */
movq 128(%rsp), %rdi
leaq 32(%rsp), %rsi
movq %rsp, %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
movq 136(%rsp), %rsi
movq 128(%rsp), %rdi
addq $0x90, %rsp
repz retq
#ifndef __APPLE__
.size fe_invert_avx2,.-fe_invert_avx2
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl curve25519_avx2
.type curve25519_avx2,@function
.align 4
curve25519_avx2:
#else
.section __TEXT,__text
.globl _curve25519_avx2
.p2align 2
_curve25519_avx2:
#endif /* __APPLE__ */
pushq %rbx
pushq %r12
pushq %r13
pushq %r14
pushq %r15
pushq %rbp
movq %rdx, %r8
subq $0xc0, %rsp
movq $0x00, 184(%rsp)
movq %rdi, 176(%rsp)
# Set one
movq $0x01, (%rdi)
movq $0x00, 8(%rdi)
movq $0x00, 16(%rdi)
movq $0x00, 24(%rdi)
# Set zero
movq $0x00, (%rsp)
movq $0x00, 8(%rsp)
movq $0x00, 16(%rsp)
movq $0x00, 24(%rsp)
# Set one
movq $0x01, 32(%rsp)
movq $0x00, 40(%rsp)
movq $0x00, 48(%rsp)
movq $0x00, 56(%rsp)
# Copy
movq (%r8), %r9
movq 8(%r8), %r10
movq 16(%r8), %r11
movq 24(%r8), %r12
movq %r9, 64(%rsp)
movq %r10, 72(%rsp)
movq %r11, 80(%rsp)
movq %r12, 88(%rsp)
movb $62, 168(%rsp)
movq $3, 160(%rsp)
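        # Montgomery ladder state: (%rdi) = x2, (%rsp) = z2, 32(%rsp) = z3 and
        # 64(%rsp) = x3 (a copy of the input point), initialised to
        # (1, 0, 1, x1).  160(%rsp)/168(%rsp) are the word and bit counters:
        # the scalar is consumed from bit 254 down to bit 0, and 184(%rsp)
        # holds the previous bit so a swap happens only when the bit changes.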
L_curve25519_avx2_words:
L_curve25519_avx2_bits:
movq 184(%rsp), %rbx
movq 160(%rsp), %r9
movb 168(%rsp), %cl
movq (%rsi,%r9,8), %rax
shrq %cl, %rax
andq $0x01, %rax
xorq %rax, %rbx
negq %rbx
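        # %rbx is now all-ones when the current scalar bit differs from the
        # previous one and zero otherwise; the two conditional swaps below use
        # it as an XOR mask so the ladder stays branch-free and constant-time.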
# Conditional Swap
movq (%rdi), %r9
movq 8(%rdi), %r10
movq 16(%rdi), %r11
movq 24(%rdi), %r12
xorq 64(%rsp), %r9
xorq 72(%rsp), %r10
xorq 80(%rsp), %r11
xorq 88(%rsp), %r12
andq %rbx, %r9
andq %rbx, %r10
andq %rbx, %r11
andq %rbx, %r12
xorq %r9, (%rdi)
xorq %r10, 8(%rdi)
xorq %r11, 16(%rdi)
xorq %r12, 24(%rdi)
xorq %r9, 64(%rsp)
xorq %r10, 72(%rsp)
xorq %r11, 80(%rsp)
xorq %r12, 88(%rsp)
# Conditional Swap
movq (%rsp), %r9
movq 8(%rsp), %r10
movq 16(%rsp), %r11
movq 24(%rsp), %r12
xorq 32(%rsp), %r9
xorq 40(%rsp), %r10
xorq 48(%rsp), %r11
xorq 56(%rsp), %r12
andq %rbx, %r9
andq %rbx, %r10
andq %rbx, %r11
andq %rbx, %r12
xorq %r9, (%rsp)
xorq %r10, 8(%rsp)
xorq %r11, 16(%rsp)
xorq %r12, 24(%rsp)
xorq %r9, 32(%rsp)
xorq %r10, 40(%rsp)
xorq %r11, 48(%rsp)
xorq %r12, 56(%rsp)
movq %rax, 184(%rsp)
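        # One ladder step: the additions, subtractions, multiplications and
        # squarings that follow implement a combined differential addition and
        # doubling of (x2:z2) and (x3:z3) modulo 2^255 - 19.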
# Add
movq (%rdi), %r9
movq 8(%rdi), %r10
movq 16(%rdi), %r11
movq 24(%rdi), %rax
movq %r9, %r13
addq (%rsp), %r9
movq %r10, %r14
adcq 8(%rsp), %r10
movq %r11, %r15
adcq 16(%rsp), %r11
movq %rax, %rbp
adcq 24(%rsp), %rax
movq $-19, %rcx
movq %rax, %r12
movq $0x7fffffffffffffff, %rbx
sarq $63, %rax
# Mask the modulus
andq %rax, %rcx
andq %rax, %rbx
# Sub modulus (if overflow)
subq %rcx, %r9
sbbq %rax, %r10
sbbq %rax, %r11
sbbq %rbx, %r12
# Sub
subq (%rsp), %r13
movq $0x00, %rax
sbbq 8(%rsp), %r14
movq $-19, %rcx
sbbq 16(%rsp), %r15
movq $0x7fffffffffffffff, %rbx
sbbq 24(%rsp), %rbp
sbbq $0x00, %rax
# Mask the modulus
andq %rax, %rcx
andq %rax, %rbx
# Add modulus (if underflow)
addq %rcx, %r13
adcq %rax, %r14
adcq %rax, %r15
adcq %rbx, %rbp
movq %r9, (%rdi)
movq %r10, 8(%rdi)
movq %r11, 16(%rdi)
movq %r12, 24(%rdi)
movq %r13, 128(%rsp)
movq %r14, 136(%rsp)
movq %r15, 144(%rsp)
movq %rbp, 152(%rsp)
# Add
movq 64(%rsp), %r9
movq 72(%rsp), %r10
movq 80(%rsp), %r11
movq 88(%rsp), %rax
movq %r9, %r13
addq 32(%rsp), %r9
movq %r10, %r14
adcq 40(%rsp), %r10
movq %r11, %r15
adcq 48(%rsp), %r11
movq %rax, %rbp
adcq 56(%rsp), %rax
movq $-19, %rcx
movq %rax, %r12
movq $0x7fffffffffffffff, %rbx
sarq $63, %rax
# Mask the modulus
andq %rax, %rcx
andq %rax, %rbx
# Sub modulus (if overflow)
subq %rcx, %r9
sbbq %rax, %r10
sbbq %rax, %r11
sbbq %rbx, %r12
# Sub
subq 32(%rsp), %r13
movq $0x00, %rax
sbbq 40(%rsp), %r14
movq $-19, %rcx
sbbq 48(%rsp), %r15
movq $0x7fffffffffffffff, %rbx
sbbq 56(%rsp), %rbp
sbbq $0x00, %rax
# Mask the modulus
andq %rax, %rcx
andq %rax, %rbx
# Add modulus (if underflow)
addq %rcx, %r13
adcq %rax, %r14
adcq %rax, %r15
adcq %rbx, %rbp
movq %r9, (%rsp)
movq %r10, 8(%rsp)
movq %r11, 16(%rsp)
movq %r12, 24(%rsp)
movq %r13, 96(%rsp)
movq %r14, 104(%rsp)
movq %r15, 112(%rsp)
movq %rbp, 120(%rsp)
# Multiply
# A[0] * B[0]
movq (%rdi), %rdx
mulxq 96(%rsp), %r9, %r10
# A[2] * B[0]
mulxq 112(%rsp), %r11, %r12
# A[1] * B[0]
mulxq 104(%rsp), %rcx, %rbx
xorq %rbp, %rbp
adcxq %rcx, %r10
# A[1] * B[3]
movq 24(%rdi), %rdx
mulxq 104(%rsp), %r13, %r14
adcxq %rbx, %r11
# A[0] * B[1]
movq 8(%rdi), %rdx
mulxq 96(%rsp), %rcx, %rbx
adoxq %rcx, %r10
# A[2] * B[1]
mulxq 112(%rsp), %rcx, %r15
adoxq %rbx, %r11
adcxq %rcx, %r12
# A[1] * B[2]
movq 16(%rdi), %rdx
mulxq 104(%rsp), %rcx, %rbx
adcxq %r15, %r13
adoxq %rcx, %r12
adcxq %rbp, %r14
adoxq %rbx, %r13
# A[0] * B[2]
mulxq 96(%rsp), %rcx, %rbx
adoxq %rbp, %r14
xorq %r15, %r15
adcxq %rcx, %r11
# A[1] * B[1]
movq 8(%rdi), %rdx
mulxq 104(%rsp), %rdx, %rcx
adcxq %rbx, %r12
adoxq %rdx, %r11
# A[3] * B[1]
movq 8(%rdi), %rdx
adoxq %rcx, %r12
mulxq 120(%rsp), %rcx, %rbx
adcxq %rcx, %r13
# A[2] * B[2]
movq 16(%rdi), %rdx
mulxq 112(%rsp), %rdx, %rcx
adcxq %rbx, %r14
adoxq %rdx, %r13
# A[3] * B[3]
movq 24(%rdi), %rdx
adoxq %rcx, %r14
mulxq 120(%rsp), %rcx, %rbx
adoxq %rbp, %r15
adcxq %rcx, %r15
# A[0] * B[3]
mulxq 96(%rsp), %rdx, %rcx
adcxq %rbx, %rbp
xorq %rbx, %rbx
adcxq %rdx, %r12
# A[3] * B[0]
movq (%rdi), %rdx
adcxq %rcx, %r13
mulxq 120(%rsp), %rdx, %rcx
adoxq %rdx, %r12
adoxq %rcx, %r13
# A[2] * B[3]
movq 24(%rdi), %rdx
mulxq 112(%rsp), %rdx, %rcx
adcxq %rdx, %r14
# A[3] * B[2]
movq 16(%rdi), %rdx
adcxq %rcx, %r15
mulxq 120(%rsp), %rcx, %rdx
adcxq %rbx, %rbp
adoxq %rcx, %r14
adoxq %rdx, %r15
adoxq %rbx, %rbp
# Reduce
movq $0x7fffffffffffffff, %rbx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r15, %rbp
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
andq %rbx, %r12
# Multiply top half by 19
movq $19, %rdx
xorq %rbx, %rbx
mulxq %r13, %rcx, %r13
adcxq %rcx, %r9
adoxq %r13, %r10
mulxq %r14, %rcx, %r14
adcxq %rcx, %r10
adoxq %r14, %r11
mulxq %r15, %rcx, %r15
adcxq %rcx, %r11
adoxq %r15, %r12
mulxq %rbp, %rbp, %rdx
adcxq %rbp, %r12
adoxq %rbx, %rdx
adcxq %rbx, %rdx
# Overflow
shldq $0x01, %r12, %rdx
movq $0x7fffffffffffffff, %rbx
imulq $19, %rdx, %rcx
andq %rbx, %r12
addq %rcx, %r9
adcq $0x00, %r10
adcq $0x00, %r11
adcq $0x00, %r12
# Reduce if top bit set
movq %r12, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rcx
andq %rbx, %r12
addq %rcx, %r9
adcq $0x00, %r10
adcq $0x00, %r11
adcq $0x00, %r12
# Store
movq %r9, 32(%rsp)
movq %r10, 40(%rsp)
movq %r11, 48(%rsp)
movq %r12, 56(%rsp)
# Multiply
# A[0] * B[0]
movq 128(%rsp), %rdx
mulxq (%rsp), %r9, %r10
# A[2] * B[0]
mulxq 16(%rsp), %r11, %r12
# A[1] * B[0]
mulxq 8(%rsp), %rcx, %rbx
xorq %rbp, %rbp
adcxq %rcx, %r10
# A[1] * B[3]
movq 152(%rsp), %rdx
mulxq 8(%rsp), %r13, %r14
adcxq %rbx, %r11
# A[0] * B[1]
movq 136(%rsp), %rdx
mulxq (%rsp), %rcx, %rbx
adoxq %rcx, %r10
# A[2] * B[1]
mulxq 16(%rsp), %rcx, %r15
adoxq %rbx, %r11
adcxq %rcx, %r12
# A[1] * B[2]
movq 144(%rsp), %rdx
mulxq 8(%rsp), %rcx, %rbx
adcxq %r15, %r13
adoxq %rcx, %r12
adcxq %rbp, %r14
adoxq %rbx, %r13
# A[0] * B[2]
mulxq (%rsp), %rcx, %rbx
adoxq %rbp, %r14
xorq %r15, %r15
adcxq %rcx, %r11
# A[1] * B[1]
movq 136(%rsp), %rdx
mulxq 8(%rsp), %rdx, %rcx
adcxq %rbx, %r12
adoxq %rdx, %r11
# A[3] * B[1]
movq 136(%rsp), %rdx
adoxq %rcx, %r12
mulxq 24(%rsp), %rcx, %rbx
adcxq %rcx, %r13
# A[2] * B[2]
movq 144(%rsp), %rdx
mulxq 16(%rsp), %rdx, %rcx
adcxq %rbx, %r14
adoxq %rdx, %r13
# A[3] * B[3]
movq 152(%rsp), %rdx
adoxq %rcx, %r14
mulxq 24(%rsp), %rcx, %rbx
adoxq %rbp, %r15
adcxq %rcx, %r15
# A[0] * B[3]
mulxq (%rsp), %rdx, %rcx
adcxq %rbx, %rbp
xorq %rbx, %rbx
adcxq %rdx, %r12
# A[3] * B[0]
movq 128(%rsp), %rdx
adcxq %rcx, %r13
mulxq 24(%rsp), %rdx, %rcx
adoxq %rdx, %r12
adoxq %rcx, %r13
# A[2] * B[3]
movq 152(%rsp), %rdx
mulxq 16(%rsp), %rdx, %rcx
adcxq %rdx, %r14
# A[3] * B[2]
movq 144(%rsp), %rdx
adcxq %rcx, %r15
mulxq 24(%rsp), %rcx, %rdx
adcxq %rbx, %rbp
adoxq %rcx, %r14
adoxq %rdx, %r15
adoxq %rbx, %rbp
# Reduce
movq $0x7fffffffffffffff, %rbx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r15, %rbp
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
andq %rbx, %r12
# Multiply top half by 19
movq $19, %rdx
xorq %rbx, %rbx
mulxq %r13, %rcx, %r13
adcxq %rcx, %r9
adoxq %r13, %r10
mulxq %r14, %rcx, %r14
adcxq %rcx, %r10
adoxq %r14, %r11
mulxq %r15, %rcx, %r15
adcxq %rcx, %r11
adoxq %r15, %r12
mulxq %rbp, %rbp, %rdx
adcxq %rbp, %r12
adoxq %rbx, %rdx
adcxq %rbx, %rdx
# Overflow
shldq $0x01, %r12, %rdx
movq $0x7fffffffffffffff, %rbx
imulq $19, %rdx, %rcx
andq %rbx, %r12
addq %rcx, %r9
adcq $0x00, %r10
adcq $0x00, %r11
adcq $0x00, %r12
# Reduce if top bit set
movq %r12, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rcx
andq %rbx, %r12
addq %rcx, %r9
adcq $0x00, %r10
adcq $0x00, %r11
adcq $0x00, %r12
# Store
movq %r9, (%rsp)
movq %r10, 8(%rsp)
movq %r11, 16(%rsp)
movq %r12, 24(%rsp)
# Square
# A[0] * A[1]
movq 128(%rsp), %rdx
mulxq 136(%rsp), %r10, %r11
# A[0] * A[3]
mulxq 152(%rsp), %r12, %r13
# A[2] * A[1]
movq 144(%rsp), %rdx
mulxq 136(%rsp), %rcx, %rbx
xorq %rbp, %rbp
adoxq %rcx, %r12
# A[2] * A[3]
mulxq 152(%rsp), %r14, %r15
adoxq %rbx, %r13
# A[2] * A[0]
mulxq 128(%rsp), %rcx, %rbx
adoxq %rbp, %r14
adcxq %rcx, %r11
adoxq %rbp, %r15
# A[1] * A[3]
movq 136(%rsp), %rdx
mulxq 152(%rsp), %rax, %r9
adcxq %rbx, %r12
adcxq %rax, %r13
adcxq %r9, %r14
adcxq %rbp, %r15
# Double with Carry Flag
xorq %rbp, %rbp
# A[0] * A[0]
movq 128(%rsp), %rdx
mulxq %rdx, %r9, %rax
adcxq %r10, %r10
# A[1] * A[1]
movq 136(%rsp), %rdx
mulxq %rdx, %rcx, %rbx
adcxq %r11, %r11
adoxq %rax, %r10
adcxq %r12, %r12
adoxq %rcx, %r11
# A[2] * A[2]
movq 144(%rsp), %rdx
mulxq %rdx, %rax, %rcx
adcxq %r13, %r13
adoxq %rbx, %r12
adcxq %r14, %r14
adoxq %rax, %r13
# A[3] * A[3]
movq 152(%rsp), %rdx
mulxq %rdx, %rax, %rbx
adcxq %r15, %r15
adoxq %rcx, %r14
adcxq %rbp, %rbp
adoxq %rax, %r15
adoxq %rbx, %rbp
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r15, %rbp
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
andq %rcx, %r12
# Multiply top half by 19
movq $19, %rdx
xorq %rcx, %rcx
mulxq %r13, %rax, %r13
adcxq %rax, %r9
adoxq %r13, %r10
mulxq %r14, %rax, %r14
adcxq %rax, %r10
adoxq %r14, %r11
mulxq %r15, %rax, %r15
adcxq %rax, %r11
adoxq %r15, %r12
mulxq %rbp, %rbp, %rdx
adcxq %rbp, %r12
adoxq %rcx, %rdx
adcxq %rcx, %rdx
# Overflow
shldq $0x01, %r12, %rdx
movq $0x7fffffffffffffff, %rcx
imulq $19, %rdx, %rax
andq %rcx, %r12
addq %rax, %r9
adcq $0x00, %r10
adcq $0x00, %r11
adcq $0x00, %r12
# Reduce if top bit set
movq %r12, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r12
addq %rax, %r9
adcq $0x00, %r10
adcq $0x00, %r11
adcq $0x00, %r12
# Store
movq %r9, 96(%rsp)
movq %r10, 104(%rsp)
movq %r11, 112(%rsp)
movq %r12, 120(%rsp)
# Square
# A[0] * A[1]
movq (%rdi), %rdx
mulxq 8(%rdi), %r10, %r11
# A[0] * A[3]
mulxq 24(%rdi), %r12, %r13
# A[2] * A[1]
movq 16(%rdi), %rdx
mulxq 8(%rdi), %rcx, %rbx
xorq %rbp, %rbp
adoxq %rcx, %r12
# A[2] * A[3]
mulxq 24(%rdi), %r14, %r15
adoxq %rbx, %r13
# A[2] * A[0]
mulxq (%rdi), %rcx, %rbx
adoxq %rbp, %r14
adcxq %rcx, %r11
adoxq %rbp, %r15
# A[1] * A[3]
movq 8(%rdi), %rdx
mulxq 24(%rdi), %rax, %r9
adcxq %rbx, %r12
adcxq %rax, %r13
adcxq %r9, %r14
adcxq %rbp, %r15
# Double with Carry Flag
xorq %rbp, %rbp
# A[0] * A[0]
movq (%rdi), %rdx
mulxq %rdx, %r9, %rax
adcxq %r10, %r10
# A[1] * A[1]
movq 8(%rdi), %rdx
mulxq %rdx, %rcx, %rbx
adcxq %r11, %r11
adoxq %rax, %r10
adcxq %r12, %r12
adoxq %rcx, %r11
# A[2] * A[2]
movq 16(%rdi), %rdx
mulxq %rdx, %rax, %rcx
adcxq %r13, %r13
adoxq %rbx, %r12
adcxq %r14, %r14
adoxq %rax, %r13
# A[3] * A[3]
movq 24(%rdi), %rdx
mulxq %rdx, %rax, %rbx
adcxq %r15, %r15
adoxq %rcx, %r14
adcxq %rbp, %rbp
adoxq %rax, %r15
adoxq %rbx, %rbp
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r15, %rbp
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
andq %rcx, %r12
# Multiply top half by 19
movq $19, %rdx
xorq %rcx, %rcx
mulxq %r13, %rax, %r13
adcxq %rax, %r9
adoxq %r13, %r10
mulxq %r14, %rax, %r14
adcxq %rax, %r10
adoxq %r14, %r11
mulxq %r15, %rax, %r15
adcxq %rax, %r11
adoxq %r15, %r12
mulxq %rbp, %rbp, %rdx
adcxq %rbp, %r12
adoxq %rcx, %rdx
adcxq %rcx, %rdx
# Overflow
shldq $0x01, %r12, %rdx
movq $0x7fffffffffffffff, %rcx
imulq $19, %rdx, %rax
andq %rcx, %r12
addq %rax, %r9
adcq $0x00, %r10
adcq $0x00, %r11
adcq $0x00, %r12
# Reduce if top bit set
movq %r12, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r12
addq %rax, %r9
adcq $0x00, %r10
adcq $0x00, %r11
adcq $0x00, %r12
# Store
movq %r9, 128(%rsp)
movq %r10, 136(%rsp)
movq %r11, 144(%rsp)
movq %r12, 152(%rsp)
# Add
movq 32(%rsp), %r9
movq 40(%rsp), %r10
movq 48(%rsp), %r11
movq 56(%rsp), %rax
movq %r9, %r13
addq (%rsp), %r9
movq %r10, %r14
adcq 8(%rsp), %r10
movq %r11, %r15
adcq 16(%rsp), %r11
movq %rax, %rbp
adcq 24(%rsp), %rax
movq $-19, %rcx
movq %rax, %r12
movq $0x7fffffffffffffff, %rbx
sarq $63, %rax
# Mask the modulus
andq %rax, %rcx
andq %rax, %rbx
# Sub modulus (if overflow)
subq %rcx, %r9
sbbq %rax, %r10
sbbq %rax, %r11
sbbq %rbx, %r12
# Sub
subq (%rsp), %r13
movq $0x00, %rax
sbbq 8(%rsp), %r14
movq $-19, %rcx
sbbq 16(%rsp), %r15
movq $0x7fffffffffffffff, %rbx
sbbq 24(%rsp), %rbp
sbbq $0x00, %rax
# Mask the modulus
andq %rax, %rcx
andq %rax, %rbx
# Add modulus (if underflow)
addq %rcx, %r13
adcq %rax, %r14
adcq %rax, %r15
adcq %rbx, %rbp
movq %r9, 64(%rsp)
movq %r10, 72(%rsp)
movq %r11, 80(%rsp)
movq %r12, 88(%rsp)
movq %r13, (%rsp)
movq %r14, 8(%rsp)
movq %r15, 16(%rsp)
movq %rbp, 24(%rsp)
# Multiply
# A[0] * B[0]
movq 96(%rsp), %rdx
mulxq 128(%rsp), %r9, %r10
# A[2] * B[0]
mulxq 144(%rsp), %r11, %r12
# A[1] * B[0]
mulxq 136(%rsp), %rcx, %rbx
xorq %rbp, %rbp
adcxq %rcx, %r10
# A[1] * B[3]
movq 120(%rsp), %rdx
mulxq 136(%rsp), %r13, %r14
adcxq %rbx, %r11
# A[0] * B[1]
movq 104(%rsp), %rdx
mulxq 128(%rsp), %rcx, %rbx
adoxq %rcx, %r10
# A[2] * B[1]
mulxq 144(%rsp), %rcx, %r15
adoxq %rbx, %r11
adcxq %rcx, %r12
# A[1] * B[2]
movq 112(%rsp), %rdx
mulxq 136(%rsp), %rcx, %rbx
adcxq %r15, %r13
adoxq %rcx, %r12
adcxq %rbp, %r14
adoxq %rbx, %r13
# A[0] * B[2]
mulxq 128(%rsp), %rcx, %rbx
adoxq %rbp, %r14
xorq %r15, %r15
adcxq %rcx, %r11
# A[1] * B[1]
movq 104(%rsp), %rdx
mulxq 136(%rsp), %rdx, %rcx
adcxq %rbx, %r12
adoxq %rdx, %r11
# A[3] * B[1]
movq 104(%rsp), %rdx
adoxq %rcx, %r12
mulxq 152(%rsp), %rcx, %rbx
adcxq %rcx, %r13
# A[2] * B[2]
movq 112(%rsp), %rdx
mulxq 144(%rsp), %rdx, %rcx
adcxq %rbx, %r14
adoxq %rdx, %r13
# A[3] * B[3]
movq 120(%rsp), %rdx
adoxq %rcx, %r14
mulxq 152(%rsp), %rcx, %rbx
adoxq %rbp, %r15
adcxq %rcx, %r15
# A[0] * B[3]
mulxq 128(%rsp), %rdx, %rcx
adcxq %rbx, %rbp
xorq %rbx, %rbx
adcxq %rdx, %r12
# A[3] * B[0]
movq 96(%rsp), %rdx
adcxq %rcx, %r13
mulxq 152(%rsp), %rdx, %rcx
adoxq %rdx, %r12
adoxq %rcx, %r13
# A[2] * B[3]
movq 120(%rsp), %rdx
mulxq 144(%rsp), %rdx, %rcx
adcxq %rdx, %r14
# A[3] * B[2]
movq 112(%rsp), %rdx
adcxq %rcx, %r15
mulxq 152(%rsp), %rcx, %rdx
adcxq %rbx, %rbp
adoxq %rcx, %r14
adoxq %rdx, %r15
adoxq %rbx, %rbp
# Reduce
movq $0x7fffffffffffffff, %rbx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r15, %rbp
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
andq %rbx, %r12
# Multiply top half by 19
movq $19, %rdx
xorq %rbx, %rbx
mulxq %r13, %rcx, %r13
adcxq %rcx, %r9
adoxq %r13, %r10
mulxq %r14, %rcx, %r14
adcxq %rcx, %r10
adoxq %r14, %r11
mulxq %r15, %rcx, %r15
adcxq %rcx, %r11
adoxq %r15, %r12
mulxq %rbp, %rbp, %rdx
adcxq %rbp, %r12
adoxq %rbx, %rdx
adcxq %rbx, %rdx
# Overflow
shldq $0x01, %r12, %rdx
movq $0x7fffffffffffffff, %rbx
imulq $19, %rdx, %rcx
andq %rbx, %r12
addq %rcx, %r9
adcq $0x00, %r10
adcq $0x00, %r11
adcq $0x00, %r12
# Reduce if top bit set
movq %r12, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rcx
andq %rbx, %r12
addq %rcx, %r9
adcq $0x00, %r10
adcq $0x00, %r11
adcq $0x00, %r12
# Store
movq %r9, (%rdi)
movq %r10, 8(%rdi)
movq %r11, 16(%rdi)
movq %r12, 24(%rdi)
# Sub
movq 128(%rsp), %r9
movq 136(%rsp), %r10
movq 144(%rsp), %r11
movq 152(%rsp), %r12
subq 96(%rsp), %r9
movq $0x00, %rax
sbbq 104(%rsp), %r10
movq $-19, %rcx
sbbq 112(%rsp), %r11
movq $0x7fffffffffffffff, %rbx
sbbq 120(%rsp), %r12
sbbq $0x00, %rax
# Mask the modulus
andq %rax, %rcx
andq %rax, %rbx
# Add modulus (if underflow)
addq %rcx, %r9
adcq %rax, %r10
adcq %rax, %r11
adcq %rbx, %r12
movq %r9, 128(%rsp)
movq %r10, 136(%rsp)
movq %r11, 144(%rsp)
movq %r12, 152(%rsp)
# Square
# A[0] * A[1]
movq (%rsp), %rdx
mulxq 8(%rsp), %r10, %r11
# A[0] * A[3]
mulxq 24(%rsp), %r12, %r13
# A[2] * A[1]
movq 16(%rsp), %rdx
mulxq 8(%rsp), %rcx, %rbx
xorq %rbp, %rbp
adoxq %rcx, %r12
# A[2] * A[3]
mulxq 24(%rsp), %r14, %r15
adoxq %rbx, %r13
# A[2] * A[0]
mulxq (%rsp), %rcx, %rbx
adoxq %rbp, %r14
adcxq %rcx, %r11
adoxq %rbp, %r15
# A[1] * A[3]
movq 8(%rsp), %rdx
mulxq 24(%rsp), %rax, %r9
adcxq %rbx, %r12
adcxq %rax, %r13
adcxq %r9, %r14
adcxq %rbp, %r15
# Double with Carry Flag
xorq %rbp, %rbp
# A[0] * A[0]
movq (%rsp), %rdx
mulxq %rdx, %r9, %rax
adcxq %r10, %r10
# A[1] * A[1]
movq 8(%rsp), %rdx
mulxq %rdx, %rcx, %rbx
adcxq %r11, %r11
adoxq %rax, %r10
adcxq %r12, %r12
adoxq %rcx, %r11
# A[2] * A[2]
movq 16(%rsp), %rdx
mulxq %rdx, %rax, %rcx
adcxq %r13, %r13
adoxq %rbx, %r12
adcxq %r14, %r14
adoxq %rax, %r13
# A[3] * A[3]
movq 24(%rsp), %rdx
mulxq %rdx, %rax, %rbx
adcxq %r15, %r15
adoxq %rcx, %r14
adcxq %rbp, %rbp
adoxq %rax, %r15
adoxq %rbx, %rbp
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r15, %rbp
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
andq %rcx, %r12
# Multiply top half by 19
movq $19, %rdx
xorq %rcx, %rcx
mulxq %r13, %rax, %r13
adcxq %rax, %r9
adoxq %r13, %r10
mulxq %r14, %rax, %r14
adcxq %rax, %r10
adoxq %r14, %r11
mulxq %r15, %rax, %r15
adcxq %rax, %r11
adoxq %r15, %r12
mulxq %rbp, %rbp, %rdx
adcxq %rbp, %r12
adoxq %rcx, %rdx
adcxq %rcx, %rdx
# Overflow
shldq $0x01, %r12, %rdx
movq $0x7fffffffffffffff, %rcx
imulq $19, %rdx, %rax
andq %rcx, %r12
addq %rax, %r9
adcq $0x00, %r10
adcq $0x00, %r11
adcq $0x00, %r12
# Reduce if top bit set
movq %r12, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r12
addq %rax, %r9
adcq $0x00, %r10
adcq $0x00, %r11
adcq $0x00, %r12
# Store
movq %r9, (%rsp)
movq %r10, 8(%rsp)
movq %r11, 16(%rsp)
movq %r12, 24(%rsp)
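        # Inlined multiply by the curve constant 121666 = 0x1db42
        # (the fe_mul121666 step of the doubling formula), with reduction.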
movq $0x1db42, %rdx
mulxq 128(%rsp), %r9, %rbp
mulxq 136(%rsp), %r10, %r15
mulxq 144(%rsp), %r11, %r14
mulxq 152(%rsp), %r12, %r13
addq %rbp, %r10
adcq %r15, %r11
adcq %r14, %r12
adcq $0x00, %r13
movq $0x7fffffffffffffff, %rbp
shldq $0x01, %r12, %r13
andq %rbp, %r12
imulq $19, %r13, %r13
addq %r13, %r9
adcq $0x00, %r10
adcq $0x00, %r11
adcq $0x00, %r12
movq %r9, 32(%rsp)
movq %r10, 40(%rsp)
movq %r11, 48(%rsp)
movq %r12, 56(%rsp)
# Square
# A[0] * A[1]
movq 64(%rsp), %rdx
mulxq 72(%rsp), %r10, %r11
# A[0] * A[3]
mulxq 88(%rsp), %r12, %r13
# A[2] * A[1]
movq 80(%rsp), %rdx
mulxq 72(%rsp), %rcx, %rbx
xorq %rbp, %rbp
adoxq %rcx, %r12
# A[2] * A[3]
mulxq 88(%rsp), %r14, %r15
adoxq %rbx, %r13
# A[2] * A[0]
mulxq 64(%rsp), %rcx, %rbx
adoxq %rbp, %r14
adcxq %rcx, %r11
adoxq %rbp, %r15
# A[1] * A[3]
movq 72(%rsp), %rdx
mulxq 88(%rsp), %rax, %r9
adcxq %rbx, %r12
adcxq %rax, %r13
adcxq %r9, %r14
adcxq %rbp, %r15
# Double with Carry Flag
xorq %rbp, %rbp
# A[0] * A[0]
movq 64(%rsp), %rdx
mulxq %rdx, %r9, %rax
adcxq %r10, %r10
# A[1] * A[1]
movq 72(%rsp), %rdx
mulxq %rdx, %rcx, %rbx
adcxq %r11, %r11
adoxq %rax, %r10
adcxq %r12, %r12
adoxq %rcx, %r11
# A[2] * A[2]
movq 80(%rsp), %rdx
mulxq %rdx, %rax, %rcx
adcxq %r13, %r13
adoxq %rbx, %r12
adcxq %r14, %r14
adoxq %rax, %r13
# A[3] * A[3]
movq 88(%rsp), %rdx
mulxq %rdx, %rax, %rbx
adcxq %r15, %r15
adoxq %rcx, %r14
adcxq %rbp, %rbp
adoxq %rax, %r15
adoxq %rbx, %rbp
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r15, %rbp
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
andq %rcx, %r12
# Multiply top half by 19
movq $19, %rdx
xorq %rcx, %rcx
mulxq %r13, %rax, %r13
adcxq %rax, %r9
adoxq %r13, %r10
mulxq %r14, %rax, %r14
adcxq %rax, %r10
adoxq %r14, %r11
mulxq %r15, %rax, %r15
adcxq %rax, %r11
adoxq %r15, %r12
mulxq %rbp, %rbp, %rdx
adcxq %rbp, %r12
adoxq %rcx, %rdx
adcxq %rcx, %rdx
# Overflow
shldq $0x01, %r12, %rdx
movq $0x7fffffffffffffff, %rcx
imulq $19, %rdx, %rax
andq %rcx, %r12
addq %rax, %r9
adcq $0x00, %r10
adcq $0x00, %r11
adcq $0x00, %r12
# Reduce if top bit set
movq %r12, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r12
addq %rax, %r9
adcq $0x00, %r10
adcq $0x00, %r11
adcq $0x00, %r12
# Store
movq %r9, 64(%rsp)
movq %r10, 72(%rsp)
movq %r11, 80(%rsp)
movq %r12, 88(%rsp)
# Add
movq 96(%rsp), %r9
movq 104(%rsp), %r10
addq 32(%rsp), %r9
movq 112(%rsp), %r11
adcq 40(%rsp), %r10
movq 120(%rsp), %rax
adcq 48(%rsp), %r11
movq $-19, %rcx
adcq 56(%rsp), %rax
movq $0x7fffffffffffffff, %rbx
movq %rax, %r12
sarq $63, %rax
# Mask the modulus
andq %rax, %rcx
andq %rax, %rbx
# Sub modulus (if overflow)
subq %rcx, %r9
sbbq %rax, %r10
sbbq %rax, %r11
sbbq %rbx, %r12
movq %r9, 96(%rsp)
movq %r10, 104(%rsp)
movq %r11, 112(%rsp)
movq %r12, 120(%rsp)
# Multiply
# A[0] * B[0]
movq (%rsp), %rdx
mulxq (%r8), %r9, %r10
# A[2] * B[0]
mulxq 16(%r8), %r11, %r12
# A[1] * B[0]
mulxq 8(%r8), %rcx, %rbx
xorq %rbp, %rbp
adcxq %rcx, %r10
# A[1] * B[3]
movq 24(%rsp), %rdx
mulxq 8(%r8), %r13, %r14
adcxq %rbx, %r11
# A[0] * B[1]
movq 8(%rsp), %rdx
mulxq (%r8), %rcx, %rbx
adoxq %rcx, %r10
# A[2] * B[1]
mulxq 16(%r8), %rcx, %r15
adoxq %rbx, %r11
adcxq %rcx, %r12
# A[1] * B[2]
movq 16(%rsp), %rdx
mulxq 8(%r8), %rcx, %rbx
adcxq %r15, %r13
adoxq %rcx, %r12
adcxq %rbp, %r14
adoxq %rbx, %r13
# A[0] * B[2]
mulxq (%r8), %rcx, %rbx
adoxq %rbp, %r14
xorq %r15, %r15
adcxq %rcx, %r11
# A[1] * B[1]
movq 8(%rsp), %rdx
mulxq 8(%r8), %rdx, %rcx
adcxq %rbx, %r12
adoxq %rdx, %r11
# A[3] * B[1]
movq 8(%rsp), %rdx
adoxq %rcx, %r12
mulxq 24(%r8), %rcx, %rbx
adcxq %rcx, %r13
# A[2] * B[2]
movq 16(%rsp), %rdx
mulxq 16(%r8), %rdx, %rcx
adcxq %rbx, %r14
adoxq %rdx, %r13
# A[3] * B[3]
movq 24(%rsp), %rdx
adoxq %rcx, %r14
mulxq 24(%r8), %rcx, %rbx
adoxq %rbp, %r15
adcxq %rcx, %r15
# A[0] * B[3]
mulxq (%r8), %rdx, %rcx
adcxq %rbx, %rbp
xorq %rbx, %rbx
adcxq %rdx, %r12
# A[3] * B[0]
movq (%rsp), %rdx
adcxq %rcx, %r13
mulxq 24(%r8), %rdx, %rcx
adoxq %rdx, %r12
adoxq %rcx, %r13
# A[2] * B[3]
movq 24(%rsp), %rdx
mulxq 16(%r8), %rdx, %rcx
adcxq %rdx, %r14
# A[3] * B[2]
movq 16(%rsp), %rdx
adcxq %rcx, %r15
mulxq 24(%r8), %rcx, %rdx
adcxq %rbx, %rbp
adoxq %rcx, %r14
adoxq %rdx, %r15
adoxq %rbx, %rbp
# Reduce
movq $0x7fffffffffffffff, %rbx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r15, %rbp
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
andq %rbx, %r12
# Multiply top half by 19
movq $19, %rdx
xorq %rbx, %rbx
mulxq %r13, %rcx, %r13
adcxq %rcx, %r9
adoxq %r13, %r10
mulxq %r14, %rcx, %r14
adcxq %rcx, %r10
adoxq %r14, %r11
mulxq %r15, %rcx, %r15
adcxq %rcx, %r11
adoxq %r15, %r12
mulxq %rbp, %rbp, %rdx
adcxq %rbp, %r12
adoxq %rbx, %rdx
adcxq %rbx, %rdx
# Overflow
shldq $0x01, %r12, %rdx
movq $0x7fffffffffffffff, %rbx
imulq $19, %rdx, %rcx
andq %rbx, %r12
addq %rcx, %r9
adcq $0x00, %r10
adcq $0x00, %r11
adcq $0x00, %r12
# Reduce if top bit set
movq %r12, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rcx
andq %rbx, %r12
addq %rcx, %r9
adcq $0x00, %r10
adcq $0x00, %r11
adcq $0x00, %r12
# Store
movq %r9, 32(%rsp)
movq %r10, 40(%rsp)
movq %r11, 48(%rsp)
movq %r12, 56(%rsp)
# Multiply
# A[0] * B[0]
movq 96(%rsp), %rdx
mulxq 128(%rsp), %r9, %r10
# A[2] * B[0]
mulxq 144(%rsp), %r11, %r12
# A[1] * B[0]
mulxq 136(%rsp), %rcx, %rbx
xorq %rbp, %rbp
adcxq %rcx, %r10
# A[1] * B[3]
movq 120(%rsp), %rdx
mulxq 136(%rsp), %r13, %r14
adcxq %rbx, %r11
# A[0] * B[1]
movq 104(%rsp), %rdx
mulxq 128(%rsp), %rcx, %rbx
adoxq %rcx, %r10
# A[2] * B[1]
mulxq 144(%rsp), %rcx, %r15
adoxq %rbx, %r11
adcxq %rcx, %r12
# A[1] * B[2]
movq 112(%rsp), %rdx
mulxq 136(%rsp), %rcx, %rbx
adcxq %r15, %r13
adoxq %rcx, %r12
adcxq %rbp, %r14
adoxq %rbx, %r13
# A[0] * B[2]
mulxq 128(%rsp), %rcx, %rbx
adoxq %rbp, %r14
xorq %r15, %r15
adcxq %rcx, %r11
# A[1] * B[1]
movq 104(%rsp), %rdx
mulxq 136(%rsp), %rdx, %rcx
adcxq %rbx, %r12
adoxq %rdx, %r11
# A[3] * B[1]
movq 104(%rsp), %rdx
adoxq %rcx, %r12
mulxq 152(%rsp), %rcx, %rbx
adcxq %rcx, %r13
# A[2] * B[2]
movq 112(%rsp), %rdx
mulxq 144(%rsp), %rdx, %rcx
adcxq %rbx, %r14
adoxq %rdx, %r13
# A[3] * B[3]
movq 120(%rsp), %rdx
adoxq %rcx, %r14
mulxq 152(%rsp), %rcx, %rbx
adoxq %rbp, %r15
adcxq %rcx, %r15
# A[0] * B[3]
mulxq 128(%rsp), %rdx, %rcx
adcxq %rbx, %rbp
xorq %rbx, %rbx
adcxq %rdx, %r12
# A[3] * B[0]
movq 96(%rsp), %rdx
adcxq %rcx, %r13
mulxq 152(%rsp), %rdx, %rcx
adoxq %rdx, %r12
adoxq %rcx, %r13
# A[2] * B[3]
movq 120(%rsp), %rdx
mulxq 144(%rsp), %rdx, %rcx
adcxq %rdx, %r14
# A[3] * B[2]
movq 112(%rsp), %rdx
adcxq %rcx, %r15
mulxq 152(%rsp), %rcx, %rdx
adcxq %rbx, %rbp
adoxq %rcx, %r14
adoxq %rdx, %r15
adoxq %rbx, %rbp
# Reduce
movq $0x7fffffffffffffff, %rbx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r15, %rbp
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
andq %rbx, %r12
# Multiply top half by 19
movq $19, %rdx
xorq %rbx, %rbx
mulxq %r13, %rcx, %r13
adcxq %rcx, %r9
adoxq %r13, %r10
mulxq %r14, %rcx, %r14
adcxq %rcx, %r10
adoxq %r14, %r11
mulxq %r15, %rcx, %r15
adcxq %rcx, %r11
adoxq %r15, %r12
mulxq %rbp, %rbp, %rdx
adcxq %rbp, %r12
adoxq %rbx, %rdx
adcxq %rbx, %rdx
# Overflow
shldq $0x01, %r12, %rdx
movq $0x7fffffffffffffff, %rbx
imulq $19, %rdx, %rcx
andq %rbx, %r12
addq %rcx, %r9
adcq $0x00, %r10
adcq $0x00, %r11
adcq $0x00, %r12
# Reduce if top bit set
movq %r12, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rcx
andq %rbx, %r12
addq %rcx, %r9
adcq $0x00, %r10
adcq $0x00, %r11
adcq $0x00, %r12
# Store
movq %r9, (%rsp)
movq %r10, 8(%rsp)
movq %r11, 16(%rsp)
movq %r12, 24(%rsp)
decb 168(%rsp)
jge L_curve25519_avx2_bits
movq $63, 168(%rsp)
decb 160(%rsp)
jge L_curve25519_avx2_words
# Invert
leaq 32(%rsp), %rdi
movq %rsp, %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 32(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 64(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
movq %rsp, %rsi
leaq 64(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 32(%rsp), %rsi
leaq 64(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 32(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 64(%rsp), %rsi
leaq 96(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 64(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 96(%rsp), %rsi
movq $4, %rdx
#ifndef __APPLE__
callq fe_sq_n_avx2@plt
#else
callq _fe_sq_n_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 96(%rsp), %rsi
leaq 64(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 64(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 96(%rsp), %rsi
movq $9, %rdx
#ifndef __APPLE__
callq fe_sq_n_avx2@plt
#else
callq _fe_sq_n_avx2
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 96(%rsp), %rsi
leaq 64(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 128(%rsp), %rdi
leaq 96(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 128(%rsp), %rdi
leaq 128(%rsp), %rsi
movq $19, %rdx
#ifndef __APPLE__
callq fe_sq_n_avx2@plt
#else
callq _fe_sq_n_avx2
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 128(%rsp), %rsi
leaq 96(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 96(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 96(%rsp), %rsi
movq $9, %rdx
#ifndef __APPLE__
callq fe_sq_n_avx2@plt
#else
callq _fe_sq_n_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 96(%rsp), %rsi
leaq 64(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 64(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 96(%rsp), %rsi
movq $49, %rdx
#ifndef __APPLE__
callq fe_sq_n_avx2@plt
#else
callq _fe_sq_n_avx2
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 96(%rsp), %rsi
leaq 64(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 128(%rsp), %rdi
leaq 96(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 128(%rsp), %rdi
leaq 128(%rsp), %rsi
movq $0x63, %rdx
#ifndef __APPLE__
callq fe_sq_n_avx2@plt
#else
callq _fe_sq_n_avx2
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 128(%rsp), %rsi
leaq 96(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 96(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 96(%rsp), %rsi
movq $49, %rdx
#ifndef __APPLE__
callq fe_sq_n_avx2@plt
#else
callq _fe_sq_n_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 96(%rsp), %rsi
leaq 64(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 64(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 64(%rsp), %rsi
movq $4, %rdx
#ifndef __APPLE__
callq fe_sq_n_avx2@plt
#else
callq _fe_sq_n_avx2
#endif /* __APPLE__ */
movq %rsp, %rdi
leaq 64(%rsp), %rsi
leaq 32(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
movq 176(%rsp), %rdi
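        # The ladder and inversion are done: (%rsp) holds 1/z2 and the output
        # buffer still holds x2, so the multiply below produces the affine
        # x-coordinate x2/z2 and stores it to the output; %rax is then cleared
        # as the return value.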
# Multiply
# A[0] * B[0]
movq (%rsp), %rdx
mulxq (%rdi), %r9, %r10
# A[2] * B[0]
mulxq 16(%rdi), %r11, %r12
# A[1] * B[0]
mulxq 8(%rdi), %rcx, %rbx
xorq %rbp, %rbp
adcxq %rcx, %r10
# A[1] * B[3]
movq 24(%rsp), %rdx
mulxq 8(%rdi), %r13, %r14
adcxq %rbx, %r11
# A[0] * B[1]
movq 8(%rsp), %rdx
mulxq (%rdi), %rcx, %rbx
adoxq %rcx, %r10
# A[2] * B[1]
mulxq 16(%rdi), %rcx, %r15
adoxq %rbx, %r11
adcxq %rcx, %r12
# A[1] * B[2]
movq 16(%rsp), %rdx
mulxq 8(%rdi), %rcx, %rbx
adcxq %r15, %r13
adoxq %rcx, %r12
adcxq %rbp, %r14
adoxq %rbx, %r13
# A[0] * B[2]
mulxq (%rdi), %rcx, %rbx
adoxq %rbp, %r14
xorq %r15, %r15
adcxq %rcx, %r11
# A[1] * B[1]
movq 8(%rsp), %rdx
mulxq 8(%rdi), %rdx, %rcx
adcxq %rbx, %r12
adoxq %rdx, %r11
# A[3] * B[1]
movq 8(%rsp), %rdx
adoxq %rcx, %r12
mulxq 24(%rdi), %rcx, %rbx
adcxq %rcx, %r13
# A[2] * B[2]
movq 16(%rsp), %rdx
mulxq 16(%rdi), %rdx, %rcx
adcxq %rbx, %r14
adoxq %rdx, %r13
# A[3] * B[3]
movq 24(%rsp), %rdx
adoxq %rcx, %r14
mulxq 24(%rdi), %rcx, %rbx
adoxq %rbp, %r15
adcxq %rcx, %r15
# A[0] * B[3]
mulxq (%rdi), %rdx, %rcx
adcxq %rbx, %rbp
xorq %rbx, %rbx
adcxq %rdx, %r12
# A[3] * B[0]
movq (%rsp), %rdx
adcxq %rcx, %r13
mulxq 24(%rdi), %rdx, %rcx
adoxq %rdx, %r12
adoxq %rcx, %r13
# A[2] * B[3]
movq 24(%rsp), %rdx
mulxq 16(%rdi), %rdx, %rcx
adcxq %rdx, %r14
# A[3] * B[2]
movq 16(%rsp), %rdx
adcxq %rcx, %r15
mulxq 24(%rdi), %rcx, %rdx
adcxq %rbx, %rbp
adoxq %rcx, %r14
adoxq %rdx, %r15
adoxq %rbx, %rbp
# Reduce
movq $0x7fffffffffffffff, %rbx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r15, %rbp
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
andq %rbx, %r12
# Multiply top half by 19
movq $19, %rdx
xorq %rbx, %rbx
mulxq %r13, %rcx, %r13
adcxq %rcx, %r9
adoxq %r13, %r10
mulxq %r14, %rcx, %r14
adcxq %rcx, %r10
adoxq %r14, %r11
mulxq %r15, %rcx, %r15
adcxq %rcx, %r11
adoxq %r15, %r12
mulxq %rbp, %rbp, %rdx
adcxq %rbp, %r12
adoxq %rbx, %rdx
adcxq %rbx, %rdx
# Overflow
shldq $0x01, %r12, %rdx
movq $0x7fffffffffffffff, %rbx
imulq $19, %rdx, %rcx
andq %rbx, %r12
addq %rcx, %r9
adcq $0x00, %r10
adcq $0x00, %r11
adcq $0x00, %r12
# Reduce if top bit set
movq %r12, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rcx
andq %rbx, %r12
addq %rcx, %r9
adcq $0x00, %r10
adcq $0x00, %r11
adcq $0x00, %r12
# Store
movq %r9, (%rdi)
movq %r10, 8(%rdi)
movq %r11, 16(%rdi)
movq %r12, 24(%rdi)
xorq %rax, %rax
addq $0xc0, %rsp
popq %rbp
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbx
repz retq
#ifndef __APPLE__
.size curve25519_avx2,.-curve25519_avx2
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_pow22523_avx2
.type fe_pow22523_avx2,@function
.align 4
fe_pow22523_avx2:
#else
.section __TEXT,__text
.globl _fe_pow22523_avx2
.p2align 2
_fe_pow22523_avx2:
#endif /* __APPLE__ */
subq $0x70, %rsp
# pow22523
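        # Computes z^((p-5)/8) = z^(2^252 - 3) for p = 2^255 - 19, the power
        # used in the square-root step of Ed25519 point decompression.  The
        # chain follows fe_invert up to z^(2^250 - 1) and then finishes with
        # two squarings and a multiply by z.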
movq %rdi, 96(%rsp)
movq %rsi, 104(%rsp)
movq %rsp, %rdi
movq 104(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
movq %rsp, %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 32(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
movq 104(%rsp), %rsi
leaq 32(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
movq %rsp, %rdi
movq %rsp, %rsi
leaq 32(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
movq %rsp, %rdi
movq %rsp, %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
movq %rsp, %rdi
leaq 32(%rsp), %rsi
movq %rsp, %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
movq %rsp, %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 32(%rsp), %rsi
movb $4, %dl
#ifndef __APPLE__
callq fe_sq_n_avx2@plt
#else
callq _fe_sq_n_avx2
#endif /* __APPLE__ */
movq %rsp, %rdi
leaq 32(%rsp), %rsi
movq %rsp, %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
movq %rsp, %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 32(%rsp), %rsi
movb $9, %dl
#ifndef __APPLE__
callq fe_sq_n_avx2@plt
#else
callq _fe_sq_n_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 32(%rsp), %rsi
movq %rsp, %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 32(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 64(%rsp), %rsi
movb $19, %dl
#ifndef __APPLE__
callq fe_sq_n_avx2@plt
#else
callq _fe_sq_n_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 64(%rsp), %rsi
leaq 32(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 32(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 32(%rsp), %rsi
movb $9, %dl
#ifndef __APPLE__
callq fe_sq_n_avx2@plt
#else
callq _fe_sq_n_avx2
#endif /* __APPLE__ */
movq %rsp, %rdi
leaq 32(%rsp), %rsi
movq %rsp, %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
movq %rsp, %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 32(%rsp), %rsi
movb $49, %dl
#ifndef __APPLE__
callq fe_sq_n_avx2@plt
#else
callq _fe_sq_n_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 32(%rsp), %rsi
movq %rsp, %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 32(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 64(%rsp), %rsi
movb $0x63, %dl
#ifndef __APPLE__
callq fe_sq_n_avx2@plt
#else
callq _fe_sq_n_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 64(%rsp), %rsi
leaq 32(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 32(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 32(%rsp), %rsi
movb $49, %dl
#ifndef __APPLE__
callq fe_sq_n_avx2@plt
#else
callq _fe_sq_n_avx2
#endif /* __APPLE__ */
movq %rsp, %rdi
leaq 32(%rsp), %rsi
movq %rsp, %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
movq %rsp, %rdi
movq %rsp, %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
movq %rsp, %rdi
movq %rsp, %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
movq 96(%rsp), %rdi
movq %rsp, %rsi
movq 104(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
movq 104(%rsp), %rsi
movq 96(%rsp), %rdi
addq $0x70, %rsp
repz retq
#ifndef __APPLE__
.size fe_pow22523_avx2,.-fe_pow22523_avx2
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_ge_to_p2_avx2
.type fe_ge_to_p2_avx2,@function
.align 4
fe_ge_to_p2_avx2:
#else
.section __TEXT,__text
.globl _fe_ge_to_p2_avx2
.p2align 2
_fe_ge_to_p2_avx2:
#endif /* __APPLE__ */
pushq %rbx
pushq %r12
pushq %r13
pushq %r14
pushq %r15
subq $40, %rsp
movq %rsi, (%rsp)
movq %rdx, 8(%rsp)
movq %rcx, 16(%rsp)
movq %r8, 24(%rsp)
movq %r9, 32(%rsp)
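        # Convert a completed (p1p1) Ed25519 point (X:Y:Z:T) to projective
        # (p2) form with the three multiplications below:
        #   out_X = X * T, out_Y = Y * Z, out_Z = Z * T.
        # The seventh argument (T) is taken from the stack at 88(%rsp).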
movq 16(%rsp), %rsi
movq 88(%rsp), %rbx
# Multiply
# A[0] * B[0]
movq (%rbx), %rdx
mulxq (%rsi), %r8, %r9
# A[2] * B[0]
mulxq 16(%rsi), %r10, %r11
# A[1] * B[0]
mulxq 8(%rsi), %rcx, %rax
xorq %r15, %r15
adcxq %rcx, %r9
# A[1] * B[3]
movq 24(%rbx), %rdx
mulxq 8(%rsi), %r12, %r13
adcxq %rax, %r10
# A[0] * B[1]
movq 8(%rbx), %rdx
mulxq (%rsi), %rcx, %rax
adoxq %rcx, %r9
# A[2] * B[1]
mulxq 16(%rsi), %rcx, %r14
adoxq %rax, %r10
adcxq %rcx, %r11
# A[1] * B[2]
movq 16(%rbx), %rdx
mulxq 8(%rsi), %rcx, %rax
adcxq %r14, %r12
adoxq %rcx, %r11
adcxq %r15, %r13
adoxq %rax, %r12
# A[0] * B[2]
mulxq (%rsi), %rcx, %rax
adoxq %r15, %r13
xorq %r14, %r14
adcxq %rcx, %r10
# A[1] * B[1]
movq 8(%rbx), %rdx
mulxq 8(%rsi), %rdx, %rcx
adcxq %rax, %r11
adoxq %rdx, %r10
# A[3] * B[1]
movq 8(%rbx), %rdx
adoxq %rcx, %r11
mulxq 24(%rsi), %rcx, %rax
adcxq %rcx, %r12
# A[2] * B[2]
movq 16(%rbx), %rdx
mulxq 16(%rsi), %rdx, %rcx
adcxq %rax, %r13
adoxq %rdx, %r12
# A[3] * B[3]
movq 24(%rbx), %rdx
adoxq %rcx, %r13
mulxq 24(%rsi), %rcx, %rax
adoxq %r15, %r14
adcxq %rcx, %r14
# A[0] * B[3]
mulxq (%rsi), %rdx, %rcx
adcxq %rax, %r15
xorq %rax, %rax
adcxq %rdx, %r11
# A[3] * B[0]
movq (%rbx), %rdx
adcxq %rcx, %r12
mulxq 24(%rsi), %rdx, %rcx
adoxq %rdx, %r11
adoxq %rcx, %r12
# A[2] * B[3]
movq 24(%rbx), %rdx
mulxq 16(%rsi), %rdx, %rcx
adcxq %rdx, %r13
# A[3] * B[2]
movq 16(%rbx), %rdx
adcxq %rcx, %r14
mulxq 24(%rsi), %rcx, %rdx
adcxq %rax, %r15
adoxq %rcx, %r13
adoxq %rdx, %r14
adoxq %rax, %r15
# Reduce
movq $0x7fffffffffffffff, %rax
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rax, %r11
# Multiply top half by 19
movq $19, %rdx
xorq %rax, %rax
mulxq %r12, %rcx, %r12
adcxq %rcx, %r8
adoxq %r12, %r9
mulxq %r13, %rcx, %r13
adcxq %rcx, %r9
adoxq %r13, %r10
mulxq %r14, %rcx, %r14
adcxq %rcx, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rax, %rdx
adcxq %rax, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rax
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq (%rsp), %rdi
movq 24(%rsp), %rsi
movq 32(%rsp), %rbx
# Multiply
# A[0] * B[0]
movq (%rbx), %rdx
mulxq (%rsi), %r8, %r9
# A[2] * B[0]
mulxq 16(%rsi), %r10, %r11
# A[1] * B[0]
mulxq 8(%rsi), %rcx, %rax
xorq %r15, %r15
adcxq %rcx, %r9
# A[1] * B[3]
movq 24(%rbx), %rdx
mulxq 8(%rsi), %r12, %r13
adcxq %rax, %r10
# A[0] * B[1]
movq 8(%rbx), %rdx
mulxq (%rsi), %rcx, %rax
adoxq %rcx, %r9
# A[2] * B[1]
mulxq 16(%rsi), %rcx, %r14
adoxq %rax, %r10
adcxq %rcx, %r11
# A[1] * B[2]
movq 16(%rbx), %rdx
mulxq 8(%rsi), %rcx, %rax
adcxq %r14, %r12
adoxq %rcx, %r11
adcxq %r15, %r13
adoxq %rax, %r12
# A[0] * B[2]
mulxq (%rsi), %rcx, %rax
adoxq %r15, %r13
xorq %r14, %r14
adcxq %rcx, %r10
# A[1] * B[1]
movq 8(%rbx), %rdx
mulxq 8(%rsi), %rdx, %rcx
adcxq %rax, %r11
adoxq %rdx, %r10
# A[3] * B[1]
movq 8(%rbx), %rdx
adoxq %rcx, %r11
mulxq 24(%rsi), %rcx, %rax
adcxq %rcx, %r12
# A[2] * B[2]
movq 16(%rbx), %rdx
mulxq 16(%rsi), %rdx, %rcx
adcxq %rax, %r13
adoxq %rdx, %r12
# A[3] * B[3]
movq 24(%rbx), %rdx
adoxq %rcx, %r13
mulxq 24(%rsi), %rcx, %rax
adoxq %r15, %r14
adcxq %rcx, %r14
# A[0] * B[3]
mulxq (%rsi), %rdx, %rcx
adcxq %rax, %r15
xorq %rax, %rax
adcxq %rdx, %r11
# A[3] * B[0]
movq (%rbx), %rdx
adcxq %rcx, %r12
mulxq 24(%rsi), %rdx, %rcx
adoxq %rdx, %r11
adoxq %rcx, %r12
# A[2] * B[3]
movq 24(%rbx), %rdx
mulxq 16(%rsi), %rdx, %rcx
adcxq %rdx, %r13
# A[3] * B[2]
movq 16(%rbx), %rdx
adcxq %rcx, %r14
mulxq 24(%rsi), %rcx, %rdx
adcxq %rax, %r15
adoxq %rcx, %r13
adoxq %rdx, %r14
adoxq %rax, %r15
# Reduce
movq $0x7fffffffffffffff, %rax
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rax, %r11
# Multiply top half by 19
movq $19, %rdx
xorq %rax, %rax
mulxq %r12, %rcx, %r12
adcxq %rcx, %r8
adoxq %r12, %r9
mulxq %r13, %rcx, %r13
adcxq %rcx, %r9
adoxq %r13, %r10
mulxq %r14, %rcx, %r14
adcxq %rcx, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rax, %rdx
adcxq %rax, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rax
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 8(%rsp), %rdi
movq 88(%rsp), %rsi
# Multiply
# A[0] * B[0]
movq (%rsi), %rdx
mulxq (%rbx), %r8, %r9
# A[2] * B[0]
mulxq 16(%rbx), %r10, %r11
# A[1] * B[0]
mulxq 8(%rbx), %rcx, %rax
xorq %r15, %r15
adcxq %rcx, %r9
# A[1] * B[3]
movq 24(%rsi), %rdx
mulxq 8(%rbx), %r12, %r13
adcxq %rax, %r10
# A[0] * B[1]
movq 8(%rsi), %rdx
mulxq (%rbx), %rcx, %rax
adoxq %rcx, %r9
# A[2] * B[1]
mulxq 16(%rbx), %rcx, %r14
adoxq %rax, %r10
adcxq %rcx, %r11
# A[1] * B[2]
movq 16(%rsi), %rdx
mulxq 8(%rbx), %rcx, %rax
adcxq %r14, %r12
adoxq %rcx, %r11
adcxq %r15, %r13
adoxq %rax, %r12
# A[0] * B[2]
mulxq (%rbx), %rcx, %rax
adoxq %r15, %r13
xorq %r14, %r14
adcxq %rcx, %r10
# A[1] * B[1]
movq 8(%rsi), %rdx
mulxq 8(%rbx), %rdx, %rcx
adcxq %rax, %r11
adoxq %rdx, %r10
# A[3] * B[1]
movq 8(%rsi), %rdx
adoxq %rcx, %r11
mulxq 24(%rbx), %rcx, %rax
adcxq %rcx, %r12
# A[2] * B[2]
movq 16(%rsi), %rdx
mulxq 16(%rbx), %rdx, %rcx
adcxq %rax, %r13
adoxq %rdx, %r12
# A[3] * B[3]
movq 24(%rsi), %rdx
adoxq %rcx, %r13
mulxq 24(%rbx), %rcx, %rax
adoxq %r15, %r14
adcxq %rcx, %r14
# A[0] * B[3]
mulxq (%rbx), %rdx, %rcx
adcxq %rax, %r15
xorq %rax, %rax
adcxq %rdx, %r11
# A[3] * B[0]
movq (%rsi), %rdx
adcxq %rcx, %r12
mulxq 24(%rbx), %rdx, %rcx
adoxq %rdx, %r11
adoxq %rcx, %r12
# A[2] * B[3]
movq 24(%rsi), %rdx
mulxq 16(%rbx), %rdx, %rcx
adcxq %rdx, %r13
# A[3] * B[2]
movq 16(%rsi), %rdx
adcxq %rcx, %r14
mulxq 24(%rbx), %rcx, %rdx
adcxq %rax, %r15
adoxq %rcx, %r13
adoxq %rdx, %r14
adoxq %rax, %r15
# Reduce
movq $0x7fffffffffffffff, %rax
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rax, %r11
# Multiply top half by 19
movq $19, %rdx
xorq %rax, %rax
mulxq %r12, %rcx, %r12
adcxq %rcx, %r8
adoxq %r12, %r9
mulxq %r13, %rcx, %r13
adcxq %rcx, %r9
adoxq %r13, %r10
mulxq %r14, %rcx, %r14
adcxq %rcx, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rax, %rdx
adcxq %rax, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rax
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
addq $40, %rsp
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbx
repz retq
#ifndef __APPLE__
.size fe_ge_to_p2_avx2,.-fe_ge_to_p2_avx2
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_ge_to_p3_avx2
.type fe_ge_to_p3_avx2,@function
.align 4
fe_ge_to_p3_avx2:
#else
.section __TEXT,__text
.globl _fe_ge_to_p3_avx2
.p2align 2
_fe_ge_to_p3_avx2:
#endif /* __APPLE__ */
pushq %rbx
pushq %r12
pushq %r13
pushq %r14
pushq %r15
subq $40, %rsp
movq %rsi, (%rsp)
movq %rdx, 8(%rsp)
movq %rcx, 16(%rsp)
movq %r8, 24(%rsp)
movq %r9, 32(%rsp)
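        # Convert a completed (p1p1) point (X:Y:Z:T) to extended (p3) form:
        # the three multiplications here give out_X = X * T, out_Y = Y * Z and
        # out_Z = Z * T, with the stack arguments (Z and T of the input) at
        # 88(%rsp) and 96(%rsp); the remaining product out_T = X * Y is
        # computed further below.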
movq 24(%rsp), %rsi
movq 96(%rsp), %rbx
# Multiply
# A[0] * B[0]
movq (%rbx), %rdx
mulxq (%rsi), %r8, %r9
# A[2] * B[0]
mulxq 16(%rsi), %r10, %r11
# A[1] * B[0]
mulxq 8(%rsi), %rcx, %rax
xorq %r15, %r15
adcxq %rcx, %r9
# A[1] * B[3]
movq 24(%rbx), %rdx
mulxq 8(%rsi), %r12, %r13
adcxq %rax, %r10
# A[0] * B[1]
movq 8(%rbx), %rdx
mulxq (%rsi), %rcx, %rax
adoxq %rcx, %r9
# A[2] * B[1]
mulxq 16(%rsi), %rcx, %r14
adoxq %rax, %r10
adcxq %rcx, %r11
# A[1] * B[2]
movq 16(%rbx), %rdx
mulxq 8(%rsi), %rcx, %rax
adcxq %r14, %r12
adoxq %rcx, %r11
adcxq %r15, %r13
adoxq %rax, %r12
# A[0] * B[2]
mulxq (%rsi), %rcx, %rax
adoxq %r15, %r13
xorq %r14, %r14
adcxq %rcx, %r10
# A[1] * B[1]
movq 8(%rbx), %rdx
mulxq 8(%rsi), %rdx, %rcx
adcxq %rax, %r11
adoxq %rdx, %r10
# A[3] * B[1]
movq 8(%rbx), %rdx
adoxq %rcx, %r11
mulxq 24(%rsi), %rcx, %rax
adcxq %rcx, %r12
# A[2] * B[2]
movq 16(%rbx), %rdx
mulxq 16(%rsi), %rdx, %rcx
adcxq %rax, %r13
adoxq %rdx, %r12
# A[3] * B[3]
movq 24(%rbx), %rdx
adoxq %rcx, %r13
mulxq 24(%rsi), %rcx, %rax
adoxq %r15, %r14
adcxq %rcx, %r14
# A[0] * B[3]
mulxq (%rsi), %rdx, %rcx
adcxq %rax, %r15
xorq %rax, %rax
adcxq %rdx, %r11
# A[3] * B[0]
movq (%rbx), %rdx
adcxq %rcx, %r12
mulxq 24(%rsi), %rdx, %rcx
adoxq %rdx, %r11
adoxq %rcx, %r12
# A[2] * B[3]
movq 24(%rbx), %rdx
mulxq 16(%rsi), %rdx, %rcx
adcxq %rdx, %r13
# A[3] * B[2]
movq 16(%rbx), %rdx
adcxq %rcx, %r14
mulxq 24(%rsi), %rcx, %rdx
adcxq %rax, %r15
adoxq %rcx, %r13
adoxq %rdx, %r14
adoxq %rax, %r15
# Reduce
movq $0x7fffffffffffffff, %rax
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rax, %r11
# Multiply top half by 19
movq $19, %rdx
xorq %rax, %rax
mulxq %r12, %rcx, %r12
adcxq %rcx, %r8
adoxq %r12, %r9
mulxq %r13, %rcx, %r13
adcxq %rcx, %r9
adoxq %r13, %r10
mulxq %r14, %rcx, %r14
adcxq %rcx, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rax, %rdx
adcxq %rax, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rax
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq (%rsp), %rdi
movq 32(%rsp), %rsi
movq 88(%rsp), %rbx
# Multiply
# A[0] * B[0]
movq (%rbx), %rdx
mulxq (%rsi), %r8, %r9
# A[2] * B[0]
mulxq 16(%rsi), %r10, %r11
# A[1] * B[0]
mulxq 8(%rsi), %rcx, %rax
xorq %r15, %r15
adcxq %rcx, %r9
# A[1] * B[3]
movq 24(%rbx), %rdx
mulxq 8(%rsi), %r12, %r13
adcxq %rax, %r10
# A[0] * B[1]
movq 8(%rbx), %rdx
mulxq (%rsi), %rcx, %rax
adoxq %rcx, %r9
# A[2] * B[1]
mulxq 16(%rsi), %rcx, %r14
adoxq %rax, %r10
adcxq %rcx, %r11
# A[1] * B[2]
movq 16(%rbx), %rdx
mulxq 8(%rsi), %rcx, %rax
adcxq %r14, %r12
adoxq %rcx, %r11
adcxq %r15, %r13
adoxq %rax, %r12
# A[0] * B[2]
mulxq (%rsi), %rcx, %rax
adoxq %r15, %r13
xorq %r14, %r14
adcxq %rcx, %r10
# A[1] * B[1]
movq 8(%rbx), %rdx
mulxq 8(%rsi), %rdx, %rcx
adcxq %rax, %r11
adoxq %rdx, %r10
# A[3] * B[1]
movq 8(%rbx), %rdx
adoxq %rcx, %r11
mulxq 24(%rsi), %rcx, %rax
adcxq %rcx, %r12
# A[2] * B[2]
movq 16(%rbx), %rdx
mulxq 16(%rsi), %rdx, %rcx
adcxq %rax, %r13
adoxq %rdx, %r12
# A[3] * B[3]
movq 24(%rbx), %rdx
adoxq %rcx, %r13
mulxq 24(%rsi), %rcx, %rax
adoxq %r15, %r14
adcxq %rcx, %r14
# A[0] * B[3]
mulxq (%rsi), %rdx, %rcx
adcxq %rax, %r15
xorq %rax, %rax
adcxq %rdx, %r11
# A[3] * B[0]
movq (%rbx), %rdx
adcxq %rcx, %r12
mulxq 24(%rsi), %rdx, %rcx
adoxq %rdx, %r11
adoxq %rcx, %r12
# A[2] * B[3]
movq 24(%rbx), %rdx
mulxq 16(%rsi), %rdx, %rcx
adcxq %rdx, %r13
# A[3] * B[2]
movq 16(%rbx), %rdx
adcxq %rcx, %r14
mulxq 24(%rsi), %rcx, %rdx
adcxq %rax, %r15
adoxq %rcx, %r13
adoxq %rdx, %r14
adoxq %rax, %r15
# Reduce
movq $0x7fffffffffffffff, %rax
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rax, %r11
# Multiply top half by 19
movq $19, %rdx
xorq %rax, %rax
mulxq %r12, %rcx, %r12
adcxq %rcx, %r8
adoxq %r12, %r9
mulxq %r13, %rcx, %r13
adcxq %rcx, %r9
adoxq %r13, %r10
mulxq %r14, %rcx, %r14
adcxq %rcx, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rax, %rdx
adcxq %rax, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rax
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 8(%rsp), %rdi
movq 96(%rsp), %rsi
# Multiply
# A[0] * B[0]
movq (%rsi), %rdx
mulxq (%rbx), %r8, %r9
# A[2] * B[0]
mulxq 16(%rbx), %r10, %r11
# A[1] * B[0]
mulxq 8(%rbx), %rcx, %rax
xorq %r15, %r15
adcxq %rcx, %r9
# A[1] * B[3]
movq 24(%rsi), %rdx
mulxq 8(%rbx), %r12, %r13
adcxq %rax, %r10
# A[0] * B[1]
movq 8(%rsi), %rdx
mulxq (%rbx), %rcx, %rax
adoxq %rcx, %r9
# A[2] * B[1]
mulxq 16(%rbx), %rcx, %r14
adoxq %rax, %r10
adcxq %rcx, %r11
# A[1] * B[2]
movq 16(%rsi), %rdx
mulxq 8(%rbx), %rcx, %rax
adcxq %r14, %r12
adoxq %rcx, %r11
adcxq %r15, %r13
adoxq %rax, %r12
# A[0] * B[2]
mulxq (%rbx), %rcx, %rax
adoxq %r15, %r13
xorq %r14, %r14
adcxq %rcx, %r10
# A[1] * B[1]
movq 8(%rsi), %rdx
mulxq 8(%rbx), %rdx, %rcx
adcxq %rax, %r11
adoxq %rdx, %r10
# A[3] * B[1]
movq 8(%rsi), %rdx
adoxq %rcx, %r11
mulxq 24(%rbx), %rcx, %rax
adcxq %rcx, %r12
# A[2] * B[2]
movq 16(%rsi), %rdx
mulxq 16(%rbx), %rdx, %rcx
adcxq %rax, %r13
adoxq %rdx, %r12
# A[3] * B[3]
movq 24(%rsi), %rdx
adoxq %rcx, %r13
mulxq 24(%rbx), %rcx, %rax
adoxq %r15, %r14
adcxq %rcx, %r14
# A[0] * B[3]
mulxq (%rbx), %rdx, %rcx
adcxq %rax, %r15
xorq %rax, %rax
adcxq %rdx, %r11
# A[3] * B[0]
movq (%rsi), %rdx
adcxq %rcx, %r12
mulxq 24(%rbx), %rdx, %rcx
adoxq %rdx, %r11
adoxq %rcx, %r12
# A[2] * B[3]
movq 24(%rsi), %rdx
mulxq 16(%rbx), %rdx, %rcx
adcxq %rdx, %r13
# A[3] * B[2]
movq 16(%rsi), %rdx
adcxq %rcx, %r14
mulxq 24(%rbx), %rcx, %rdx
adcxq %rax, %r15
adoxq %rcx, %r13
adoxq %rdx, %r14
adoxq %rax, %r15
# Reduce
movq $0x7fffffffffffffff, %rax
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rax, %r11
# Multiply top half by 19
movq $19, %rdx
xorq %rax, %rax
mulxq %r12, %rcx, %r12
adcxq %rcx, %r8
adoxq %r12, %r9
mulxq %r13, %rcx, %r13
adcxq %rcx, %r9
adoxq %r13, %r10
mulxq %r14, %rcx, %r14
adcxq %rcx, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rax, %rdx
adcxq %rax, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rax
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 16(%rsp), %rdi
movq 24(%rsp), %rsi
movq 32(%rsp), %rbx
# Multiply
# A[0] * B[0]
movq (%rbx), %rdx
mulxq (%rsi), %r8, %r9
# A[2] * B[0]
mulxq 16(%rsi), %r10, %r11
# A[1] * B[0]
mulxq 8(%rsi), %rcx, %rax
xorq %r15, %r15
adcxq %rcx, %r9
# A[1] * B[3]
movq 24(%rbx), %rdx
mulxq 8(%rsi), %r12, %r13
adcxq %rax, %r10
# A[0] * B[1]
movq 8(%rbx), %rdx
mulxq (%rsi), %rcx, %rax
adoxq %rcx, %r9
# A[2] * B[1]
mulxq 16(%rsi), %rcx, %r14
adoxq %rax, %r10
adcxq %rcx, %r11
# A[1] * B[2]
movq 16(%rbx), %rdx
mulxq 8(%rsi), %rcx, %rax
adcxq %r14, %r12
adoxq %rcx, %r11
adcxq %r15, %r13
adoxq %rax, %r12
# A[0] * B[2]
mulxq (%rsi), %rcx, %rax
adoxq %r15, %r13
xorq %r14, %r14
adcxq %rcx, %r10
# A[1] * B[1]
movq 8(%rbx), %rdx
mulxq 8(%rsi), %rdx, %rcx
adcxq %rax, %r11
adoxq %rdx, %r10
# A[3] * B[1]
movq 8(%rbx), %rdx
adoxq %rcx, %r11
mulxq 24(%rsi), %rcx, %rax
adcxq %rcx, %r12
# A[2] * B[2]
movq 16(%rbx), %rdx
mulxq 16(%rsi), %rdx, %rcx
adcxq %rax, %r13
adoxq %rdx, %r12
# A[3] * B[3]
movq 24(%rbx), %rdx
adoxq %rcx, %r13
mulxq 24(%rsi), %rcx, %rax
adoxq %r15, %r14
adcxq %rcx, %r14
# A[0] * B[3]
mulxq (%rsi), %rdx, %rcx
adcxq %rax, %r15
xorq %rax, %rax
adcxq %rdx, %r11
# A[3] * B[0]
movq (%rbx), %rdx
adcxq %rcx, %r12
mulxq 24(%rsi), %rdx, %rcx
adoxq %rdx, %r11
adoxq %rcx, %r12
# A[2] * B[3]
movq 24(%rbx), %rdx
mulxq 16(%rsi), %rdx, %rcx
adcxq %rdx, %r13
# A[3] * B[2]
movq 16(%rbx), %rdx
adcxq %rcx, %r14
mulxq 24(%rsi), %rcx, %rdx
adcxq %rax, %r15
adoxq %rcx, %r13
adoxq %rdx, %r14
adoxq %rax, %r15
# Reduce
movq $0x7fffffffffffffff, %rax
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rax, %r11
# Multiply top half by 19
movq $19, %rdx
xorq %rax, %rax
mulxq %r12, %rcx, %r12
adcxq %rcx, %r8
adoxq %r12, %r9
mulxq %r13, %rcx, %r13
adcxq %rcx, %r9
adoxq %r13, %r10
mulxq %r14, %rcx, %r14
adcxq %rcx, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rax, %rdx
adcxq %rax, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rax
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
addq $40, %rsp
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbx
repz retq
#ifndef __APPLE__
.size fe_ge_to_p3_avx2,.-fe_ge_to_p3_avx2
#endif /* __APPLE__ */
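# fe_ge_dbl_avx2: BMI2/ADX implementation of the point-doubling field
# arithmetic (presumably the ge_p2_dbl-style doubling of the Ed25519 group
# code): squarings, additions/subtractions and one doubled squaring
# ("Square * 2"), all over GF(2^255 - 19). Six field-element pointers arrive
# in registers and a seventh on the stack.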
#ifndef __APPLE__
.text
.globl fe_ge_dbl_avx2
.type fe_ge_dbl_avx2,@function
.align 4
fe_ge_dbl_avx2:
#else
.section __TEXT,__text
.globl _fe_ge_dbl_avx2
.p2align 2
_fe_ge_dbl_avx2:
#endif /* __APPLE__ */
pushq %rbp
pushq %rbx
pushq %r12
pushq %r13
pushq %r14
pushq %r15
subq $48, %rsp
movq %rdi, (%rsp)
movq %rsi, 8(%rsp)
movq %rdx, 16(%rsp)
movq %rcx, 24(%rsp)
movq %r8, 32(%rsp)
movq %r9, 40(%rsp)
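# The six register arguments are spilled to the local frame above; with the
# 48-byte frame, six saved registers and the return address, the caller's
# stack argument is reached at 104(%rsp) further down.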
movq 32(%rsp), %rsi
# Square
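# 4-limb squaring: the off-diagonal products A[i]*A[j] (i < j) are computed
# once, then doubled through the ADCX chain while the diagonal squares A[i]^2
# are accumulated through the ADOX chain, before the usual reduction.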
# A[0] * A[1]
movq (%rsi), %rdx
mulxq 8(%rsi), %r9, %r10
# A[0] * A[3]
mulxq 24(%rsi), %r11, %r12
# A[2] * A[1]
movq 16(%rsi), %rdx
mulxq 8(%rsi), %rcx, %rax
xorq %r15, %r15
adoxq %rcx, %r11
# A[2] * A[3]
mulxq 24(%rsi), %r13, %r14
adoxq %rax, %r12
# A[2] * A[0]
mulxq (%rsi), %rcx, %rax
adoxq %r15, %r13
adcxq %rcx, %r10
adoxq %r15, %r14
# A[1] * A[3]
movq 8(%rsi), %rdx
mulxq 24(%rsi), %rbp, %r8
adcxq %rax, %r11
adcxq %rbp, %r12
adcxq %r8, %r13
adcxq %r15, %r14
# Double with Carry Flag
xorq %r15, %r15
# A[0] * A[0]
movq (%rsi), %rdx
mulxq %rdx, %r8, %rbp
adcxq %r9, %r9
# A[1] * A[1]
movq 8(%rsi), %rdx
mulxq %rdx, %rcx, %rax
adcxq %r10, %r10
adoxq %rbp, %r9
adcxq %r11, %r11
adoxq %rcx, %r10
# A[2] * A[2]
movq 16(%rsi), %rdx
mulxq %rdx, %rbp, %rcx
adcxq %r12, %r12
adoxq %rax, %r11
adcxq %r13, %r13
adoxq %rbp, %r12
# A[3] * A[3]
movq 24(%rsi), %rdx
mulxq %rdx, %rbp, %rax
adcxq %r14, %r14
adoxq %rcx, %r13
adcxq %r15, %r15
adoxq %rbp, %r14
adoxq %rax, %r15
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rcx, %r11
# Multiply top half by 19
movq $19, %rdx
xorq %rcx, %rcx
mulxq %r12, %rbp, %r12
adcxq %rbp, %r8
adoxq %r12, %r9
mulxq %r13, %rbp, %r13
adcxq %rbp, %r9
adoxq %r13, %r10
mulxq %r14, %rbp, %r14
adcxq %rbp, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rcx, %rdx
adcxq %rcx, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rcx
imulq $19, %rdx, %rbp
andq %rcx, %r11
addq %rbp, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rbp
andq %rcx, %r11
addq %rbp, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 16(%rsp), %rdi
movq 40(%rsp), %rbx
# Square
# A[0] * A[1]
movq (%rbx), %rdx
mulxq 8(%rbx), %r9, %r10
# A[0] * A[3]
mulxq 24(%rbx), %r11, %r12
# A[2] * A[1]
movq 16(%rbx), %rdx
mulxq 8(%rbx), %rcx, %rax
xorq %r15, %r15
adoxq %rcx, %r11
# A[2] * A[3]
mulxq 24(%rbx), %r13, %r14
adoxq %rax, %r12
# A[2] * A[0]
mulxq (%rbx), %rcx, %rax
adoxq %r15, %r13
adcxq %rcx, %r10
adoxq %r15, %r14
# A[1] * A[3]
movq 8(%rbx), %rdx
mulxq 24(%rbx), %rbp, %r8
adcxq %rax, %r11
adcxq %rbp, %r12
adcxq %r8, %r13
adcxq %r15, %r14
# Double with Carry Flag
xorq %r15, %r15
# A[0] * A[0]
movq (%rbx), %rdx
mulxq %rdx, %r8, %rbp
adcxq %r9, %r9
# A[1] * A[1]
movq 8(%rbx), %rdx
mulxq %rdx, %rcx, %rax
adcxq %r10, %r10
adoxq %rbp, %r9
adcxq %r11, %r11
adoxq %rcx, %r10
# A[2] * A[2]
movq 16(%rbx), %rdx
mulxq %rdx, %rbp, %rcx
adcxq %r12, %r12
adoxq %rax, %r11
adcxq %r13, %r13
adoxq %rbp, %r12
# A[3] * A[3]
movq 24(%rbx), %rdx
mulxq %rdx, %rbp, %rax
adcxq %r14, %r14
adoxq %rcx, %r13
adcxq %r15, %r15
adoxq %rbp, %r14
adoxq %rax, %r15
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rcx, %r11
# Multiply top half by 19
movq $19, %rdx
xorq %rcx, %rcx
mulxq %r12, %rbp, %r12
adcxq %rbp, %r8
adoxq %r12, %r9
mulxq %r13, %rbp, %r13
adcxq %rbp, %r9
adoxq %r13, %r10
mulxq %r14, %rbp, %r14
adcxq %rbp, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rcx, %rdx
adcxq %rcx, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rcx
imulq $19, %rdx, %rbp
andq %rcx, %r11
addq %rbp, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rbp
andq %rcx, %r11
addq %rbp, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 8(%rsp), %rdi
# Add
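# Field addition: add the 4-limb operands, then turn bit 255 of the sum into
# an all-ones/all-zero mask (SAR by 63) and subtract the modulus under that
# mask, so the conditional correction is branch-free.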
movq (%rsi), %r8
movq 8(%rsi), %r9
addq (%rbx), %r8
movq 16(%rsi), %r10
adcq 8(%rbx), %r9
movq 24(%rsi), %rdx
adcq 16(%rbx), %r10
movq $-19, %rcx
adcq 24(%rbx), %rdx
movq $0x7fffffffffffffff, %rax
movq %rdx, %r11
sarq $63, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Sub modulus (if overflow)
subq %rcx, %r8
sbbq %rdx, %r9
sbbq %rdx, %r10
sbbq %rax, %r11
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 24(%rsp), %rsi
# Square
# A[0] * A[1]
movq (%rdi), %rdx
mulxq 8(%rdi), %r9, %r10
# A[0] * A[3]
mulxq 24(%rdi), %r11, %r12
# A[2] * A[1]
movq 16(%rdi), %rdx
mulxq 8(%rdi), %rcx, %rax
xorq %r15, %r15
adoxq %rcx, %r11
# A[2] * A[3]
mulxq 24(%rdi), %r13, %r14
adoxq %rax, %r12
# A[2] * A[0]
mulxq (%rdi), %rcx, %rax
adoxq %r15, %r13
adcxq %rcx, %r10
adoxq %r15, %r14
# A[1] * A[3]
movq 8(%rdi), %rdx
mulxq 24(%rdi), %rbp, %r8
adcxq %rax, %r11
adcxq %rbp, %r12
adcxq %r8, %r13
adcxq %r15, %r14
# Double with Carry Flag
xorq %r15, %r15
# A[0] * A[0]
movq (%rdi), %rdx
mulxq %rdx, %r8, %rbp
adcxq %r9, %r9
# A[1] * A[1]
movq 8(%rdi), %rdx
mulxq %rdx, %rcx, %rax
adcxq %r10, %r10
adoxq %rbp, %r9
adcxq %r11, %r11
adoxq %rcx, %r10
# A[2] * A[2]
movq 16(%rdi), %rdx
mulxq %rdx, %rbp, %rcx
adcxq %r12, %r12
adoxq %rax, %r11
adcxq %r13, %r13
adoxq %rbp, %r12
# A[3] * A[3]
movq 24(%rdi), %rdx
mulxq %rdx, %rbp, %rax
adcxq %r14, %r14
adoxq %rcx, %r13
adcxq %r15, %r15
adoxq %rbp, %r14
adoxq %rax, %r15
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rcx, %r11
# Multiply top half by 19
movq $19, %rdx
xorq %rcx, %rcx
mulxq %r12, %rbp, %r12
adcxq %rbp, %r8
adoxq %r12, %r9
mulxq %r13, %rbp, %r13
adcxq %rbp, %r9
adoxq %r13, %r10
mulxq %r14, %rbp, %r14
adcxq %rbp, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rcx, %rdx
adcxq %rcx, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rcx
imulq $19, %rdx, %rbp
andq %rcx, %r11
addq %rbp, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rbp
andq %rcx, %r11
addq %rbp, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rsi)
movq %r9, 8(%rsi)
movq %r10, 16(%rsi)
movq %r11, 24(%rsi)
movq 16(%rsp), %rsi
movq (%rsp), %rbx
# Add
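# Combined add/sub: the first operand's limbs are also copied to r12-r15 so
# that the sum and the difference can be produced in one pass, each with its
# own conditional modulus correction, before both are stored.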
movq (%rsi), %r8
movq 8(%rsi), %r9
movq 16(%rsi), %r10
movq 24(%rsi), %rdx
movq %r8, %r12
addq (%rbx), %r8
movq %r9, %r13
adcq 8(%rbx), %r9
movq %r10, %r14
adcq 16(%rbx), %r10
movq %rdx, %r15
adcq 24(%rbx), %rdx
movq $-19, %rcx
movq %rdx, %r11
movq $0x7fffffffffffffff, %rax
sarq $63, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Sub modulus (if overflow)
subq %rcx, %r8
sbbq %rdx, %r9
sbbq %rdx, %r10
sbbq %rax, %r11
# Sub
subq (%rbx), %r12
movq $0x00, %rdx
sbbq 8(%rbx), %r13
movq $-19, %rcx
sbbq 16(%rbx), %r14
movq $0x7fffffffffffffff, %rax
sbbq 24(%rbx), %r15
sbbq $0x00, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Add modulus (if underflow)
addq %rcx, %r12
adcq %rdx, %r13
adcq %rdx, %r14
adcq %rax, %r15
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq %r12, (%rsi)
movq %r13, 8(%rsi)
movq %r14, 16(%rsi)
movq %r15, 24(%rsi)
movq 24(%rsp), %rsi
# Sub
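# Field subtraction: subtract the 4-limb operands, capture the final borrow in
# %rdx (0 or -1) and use it as a mask to add the modulus back, so a negative
# intermediate result is brought into range without branching.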
movq (%rsi), %r8
movq 8(%rsi), %r9
movq 16(%rsi), %r10
movq 24(%rsi), %r11
subq (%rdi), %r8
movq $0x00, %rdx
sbbq 8(%rdi), %r9
movq $-19, %rcx
sbbq 16(%rdi), %r10
movq $0x7fffffffffffffff, %rax
sbbq 24(%rdi), %r11
sbbq $0x00, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Add modulus (if underflow)
addq %rcx, %r8
adcq %rdx, %r9
adcq %rdx, %r10
adcq %rax, %r11
movq %r8, (%rbx)
movq %r9, 8(%rbx)
movq %r10, 16(%rbx)
movq %r11, 24(%rbx)
movq 104(%rsp), %rdi
# Square * 2
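# Square * 2: computes 2*A^2 mod p in a single pass. The squaring itself is
# identical to the one above; the doubling is absorbed into the reduction
# shifts (shift-by-1 on the low limbs, shift-by-2 when extracting the upper
# half) instead of being a separate pass.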
# A[0] * A[1]
movq (%rdi), %rdx
mulxq 8(%rdi), %r9, %r10
# A[0] * A[3]
mulxq 24(%rdi), %r11, %r12
# A[2] * A[1]
movq 16(%rdi), %rdx
mulxq 8(%rdi), %rcx, %rax
xorq %r15, %r15
adoxq %rcx, %r11
# A[2] * A[3]
mulxq 24(%rdi), %r13, %r14
adoxq %rax, %r12
# A[2] * A[0]
mulxq (%rdi), %rcx, %rax
adoxq %r15, %r13
adcxq %rcx, %r10
adoxq %r15, %r14
# A[1] * A[3]
movq 8(%rdi), %rdx
mulxq 24(%rdi), %rbp, %r8
adcxq %rax, %r11
adcxq %rbp, %r12
adcxq %r8, %r13
adcxq %r15, %r14
# Double with Carry Flag
xorq %r15, %r15
# A[0] * A[0]
movq (%rdi), %rdx
mulxq %rdx, %r8, %rbp
adcxq %r9, %r9
# A[1] * A[1]
movq 8(%rdi), %rdx
mulxq %rdx, %rcx, %rax
adcxq %r10, %r10
adoxq %rbp, %r9
adcxq %r11, %r11
adoxq %rcx, %r10
# A[2] * A[2]
movq 16(%rdi), %rdx
mulxq %rdx, %rbp, %rcx
adcxq %r12, %r12
adoxq %rax, %r11
adcxq %r13, %r13
adoxq %rbp, %r12
# A[3] * A[3]
movq 24(%rdi), %rdx
mulxq %rdx, %rbp, %rax
adcxq %r14, %r14
adoxq %rcx, %r13
adcxq %r15, %r15
adoxq %rbp, %r14
adoxq %rax, %r15
# Reduce
movq $0x7fffffffffffffff, %rax
xorq %rbp, %rbp
# Move top half into t4-t7 and remove top bit from t3 and double
shldq $3, %r15, %rbp
shldq $2, %r14, %r15
shldq $2, %r13, %r14
shldq $2, %r12, %r13
shldq $2, %r11, %r12
shldq $0x01, %r10, %r11
shldq $0x01, %r9, %r10
shldq $0x01, %r8, %r9
shlq $0x01, %r8
andq %rax, %r11
# Two out left, one in right
andq %rax, %r15
# Multiply top bits by 19*19
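# The bits gathered into %rbp above sit at weight 2^510 of 2*A^2, and
# 2^510 mod p = (2^255)^2 mod p = 19*19 = 0x169, hence the single IMUL below.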
imulq $0x169, %rbp, %rcx
xorq %rax, %rax
# Multiply top half by 19
movq $19, %rdx
adoxq %rcx, %r8
mulxq %r12, %rbp, %r12
adcxq %rbp, %r8
adoxq %r12, %r9
mulxq %r13, %rbp, %r13
adcxq %rbp, %r9
adoxq %r13, %r10
mulxq %r14, %rbp, %r14
adcxq %rbp, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rax, %rdx
adcxq %rax, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rax
imulq $19, %rdx, %rbp
andq %rax, %r11
addq %rbp, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rbp
andq %rax, %r11
addq %rbp, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rsi)
movq %r9, 8(%rsi)
movq %r10, 16(%rsi)
movq %r11, 24(%rsi)
movq 16(%rsp), %rdi
# Sub
movq (%rsi), %r8
movq 8(%rsi), %r9
movq 16(%rsi), %r10
movq 24(%rsi), %r11
subq (%rdi), %r8
movq $0x00, %rdx
sbbq 8(%rdi), %r9
movq $-19, %rcx
sbbq 16(%rdi), %r10
movq $0x7fffffffffffffff, %rax
sbbq 24(%rdi), %r11
sbbq $0x00, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Add modulus (if underflow)
addq %rcx, %r8
adcq %rdx, %r9
adcq %rdx, %r10
adcq %rax, %r11
movq %r8, (%rsi)
movq %r9, 8(%rsi)
movq %r10, 16(%rsi)
movq %r11, 24(%rsi)
addq $48, %rsp
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbx
popq %rbp
repz retq
#ifndef __APPLE__
.size fe_ge_dbl_avx2,.-fe_ge_dbl_avx2
#endif /* __APPLE__ */
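# fe_ge_madd_avx2: field arithmetic core of a mixed point addition, presumably
# combining a projective Ed25519 point with a precomputed (y+x, y-x, xy*2d)
# table entry. Six field-element pointers are passed in registers and five
# more on the caller's stack, starting at 104(%rsp).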
#ifndef __APPLE__
.text
.globl fe_ge_madd_avx2
.type fe_ge_madd_avx2,@function
.align 4
fe_ge_madd_avx2:
#else
.section __TEXT,__text
.globl _fe_ge_madd_avx2
.p2align 2
_fe_ge_madd_avx2:
#endif /* __APPLE__ */
pushq %rbp
pushq %rbx
pushq %r12
pushq %r13
pushq %r14
pushq %r15
subq $48, %rsp
movq %rdi, (%rsp)
movq %rsi, 8(%rsp)
movq %rdx, 16(%rsp)
movq %rcx, 24(%rsp)
movq %r8, 32(%rsp)
movq %r9, 40(%rsp)
movq 8(%rsp), %rsi
movq 40(%rsp), %rbx
movq 32(%rsp), %rbp
# Add
movq (%rbx), %r8
movq 8(%rbx), %r9
movq 16(%rbx), %r10
movq 24(%rbx), %rdx
movq %r8, %r12
addq (%rbp), %r8
movq %r9, %r13
adcq 8(%rbp), %r9
movq %r10, %r14
adcq 16(%rbp), %r10
movq %rdx, %r15
adcq 24(%rbp), %rdx
movq $-19, %rcx
movq %rdx, %r11
movq $0x7fffffffffffffff, %rax
sarq $63, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Sub modulus (if overflow)
subq %rcx, %r8
sbbq %rdx, %r9
sbbq %rdx, %r10
sbbq %rax, %r11
# Sub
subq (%rbp), %r12
movq $0x00, %rdx
sbbq 8(%rbp), %r13
movq $-19, %rcx
sbbq 16(%rbp), %r14
movq $0x7fffffffffffffff, %rax
sbbq 24(%rbp), %r15
sbbq $0x00, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Add modulus (if underflow)
addq %rcx, %r12
adcq %rdx, %r13
adcq %rdx, %r14
adcq %rax, %r15
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq %r12, (%rsi)
movq %r13, 8(%rsi)
movq %r14, 16(%rsi)
movq %r15, 24(%rsi)
movq 16(%rsp), %rbx
movq 128(%rsp), %rbp
# Multiply
# A[0] * B[0]
movq (%rbp), %rdx
mulxq (%rdi), %r8, %r9
# A[2] * B[0]
mulxq 16(%rdi), %r10, %r11
# A[1] * B[0]
mulxq 8(%rdi), %rcx, %rax
xorq %r15, %r15
adcxq %rcx, %r9
# A[1] * B[3]
movq 24(%rbp), %rdx
mulxq 8(%rdi), %r12, %r13
adcxq %rax, %r10
# A[0] * B[1]
movq 8(%rbp), %rdx
mulxq (%rdi), %rcx, %rax
adoxq %rcx, %r9
# A[2] * B[1]
mulxq 16(%rdi), %rcx, %r14
adoxq %rax, %r10
adcxq %rcx, %r11
# A[1] * B[2]
movq 16(%rbp), %rdx
mulxq 8(%rdi), %rcx, %rax
adcxq %r14, %r12
adoxq %rcx, %r11
adcxq %r15, %r13
adoxq %rax, %r12
# A[0] * B[2]
mulxq (%rdi), %rcx, %rax
adoxq %r15, %r13
xorq %r14, %r14
adcxq %rcx, %r10
# A[1] * B[1]
movq 8(%rbp), %rdx
mulxq 8(%rdi), %rdx, %rcx
adcxq %rax, %r11
adoxq %rdx, %r10
# A[3] * B[1]
movq 8(%rbp), %rdx
adoxq %rcx, %r11
mulxq 24(%rdi), %rcx, %rax
adcxq %rcx, %r12
# A[2] * B[2]
movq 16(%rbp), %rdx
mulxq 16(%rdi), %rdx, %rcx
adcxq %rax, %r13
adoxq %rdx, %r12
# A[3] * B[3]
movq 24(%rbp), %rdx
adoxq %rcx, %r13
mulxq 24(%rdi), %rcx, %rax
adoxq %r15, %r14
adcxq %rcx, %r14
# A[0] * B[3]
mulxq (%rdi), %rdx, %rcx
adcxq %rax, %r15
xorq %rax, %rax
adcxq %rdx, %r11
# A[3] * B[0]
movq (%rbp), %rdx
adcxq %rcx, %r12
mulxq 24(%rdi), %rdx, %rcx
adoxq %rdx, %r11
adoxq %rcx, %r12
# A[2] * B[3]
movq 24(%rbp), %rdx
mulxq 16(%rdi), %rdx, %rcx
adcxq %rdx, %r13
# A[3] * B[2]
movq 16(%rbp), %rdx
adcxq %rcx, %r14
mulxq 24(%rdi), %rcx, %rdx
adcxq %rax, %r15
adoxq %rcx, %r13
adoxq %rdx, %r14
adoxq %rax, %r15
# Reduce
movq $0x7fffffffffffffff, %rax
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rax, %r11
# Multiply top half by 19
movq $19, %rdx
xorq %rax, %rax
mulxq %r12, %rcx, %r12
adcxq %rcx, %r8
adoxq %r12, %r9
mulxq %r13, %rcx, %r13
adcxq %rcx, %r9
adoxq %r13, %r10
mulxq %r14, %rcx, %r14
adcxq %rcx, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rax, %rdx
adcxq %rax, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rax
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rbx)
movq %r9, 8(%rbx)
movq %r10, 16(%rbx)
movq %r11, 24(%rbx)
movq 136(%rsp), %rdi
# Multiply
# A[0] * B[0]
movq (%rdi), %rdx
mulxq (%rsi), %r8, %r9
# A[2] * B[0]
mulxq 16(%rsi), %r10, %r11
# A[1] * B[0]
mulxq 8(%rsi), %rcx, %rax
xorq %r15, %r15
adcxq %rcx, %r9
# A[1] * B[3]
movq 24(%rdi), %rdx
mulxq 8(%rsi), %r12, %r13
adcxq %rax, %r10
# A[0] * B[1]
movq 8(%rdi), %rdx
mulxq (%rsi), %rcx, %rax
adoxq %rcx, %r9
# A[2] * B[1]
mulxq 16(%rsi), %rcx, %r14
adoxq %rax, %r10
adcxq %rcx, %r11
# A[1] * B[2]
movq 16(%rdi), %rdx
mulxq 8(%rsi), %rcx, %rax
adcxq %r14, %r12
adoxq %rcx, %r11
adcxq %r15, %r13
adoxq %rax, %r12
# A[0] * B[2]
mulxq (%rsi), %rcx, %rax
adoxq %r15, %r13
xorq %r14, %r14
adcxq %rcx, %r10
# A[1] * B[1]
movq 8(%rdi), %rdx
mulxq 8(%rsi), %rdx, %rcx
adcxq %rax, %r11
adoxq %rdx, %r10
# A[3] * B[1]
movq 8(%rdi), %rdx
adoxq %rcx, %r11
mulxq 24(%rsi), %rcx, %rax
adcxq %rcx, %r12
# A[2] * B[2]
movq 16(%rdi), %rdx
mulxq 16(%rsi), %rdx, %rcx
adcxq %rax, %r13
adoxq %rdx, %r12
# A[3] * B[3]
movq 24(%rdi), %rdx
adoxq %rcx, %r13
mulxq 24(%rsi), %rcx, %rax
adoxq %r15, %r14
adcxq %rcx, %r14
# A[0] * B[3]
mulxq (%rsi), %rdx, %rcx
adcxq %rax, %r15
xorq %rax, %rax
adcxq %rdx, %r11
# A[3] * B[0]
movq (%rdi), %rdx
adcxq %rcx, %r12
mulxq 24(%rsi), %rdx, %rcx
adoxq %rdx, %r11
adoxq %rcx, %r12
# A[2] * B[3]
movq 24(%rdi), %rdx
mulxq 16(%rsi), %rdx, %rcx
adcxq %rdx, %r13
# A[3] * B[2]
movq 16(%rdi), %rdx
adcxq %rcx, %r14
mulxq 24(%rsi), %rcx, %rdx
adcxq %rax, %r15
adoxq %rcx, %r13
adoxq %rdx, %r14
adoxq %rax, %r15
# Reduce
movq $0x7fffffffffffffff, %rax
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rax, %r11
# Multiply top half by 19
movq $19, %rdx
xorq %rax, %rax
mulxq %r12, %rcx, %r12
adcxq %rcx, %r8
adoxq %r12, %r9
mulxq %r13, %rcx, %r13
adcxq %rcx, %r9
adoxq %r13, %r10
mulxq %r14, %rcx, %r14
adcxq %rcx, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rax, %rdx
adcxq %rax, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rax
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rsi)
movq %r9, 8(%rsi)
movq %r10, 16(%rsi)
movq %r11, 24(%rsi)
movq 24(%rsp), %rdi
movq 120(%rsp), %rsi
movq 112(%rsp), %rbp
# Multiply
# A[0] * B[0]
movq (%rbp), %rdx
mulxq (%rsi), %r8, %r9
# A[2] * B[0]
mulxq 16(%rsi), %r10, %r11
# A[1] * B[0]
mulxq 8(%rsi), %rcx, %rax
xorq %r15, %r15
adcxq %rcx, %r9
# A[1] * B[3]
movq 24(%rbp), %rdx
mulxq 8(%rsi), %r12, %r13
adcxq %rax, %r10
# A[0] * B[1]
movq 8(%rbp), %rdx
mulxq (%rsi), %rcx, %rax
adoxq %rcx, %r9
# A[2] * B[1]
mulxq 16(%rsi), %rcx, %r14
adoxq %rax, %r10
adcxq %rcx, %r11
# A[1] * B[2]
movq 16(%rbp), %rdx
mulxq 8(%rsi), %rcx, %rax
adcxq %r14, %r12
adoxq %rcx, %r11
adcxq %r15, %r13
adoxq %rax, %r12
# A[0] * B[2]
mulxq (%rsi), %rcx, %rax
adoxq %r15, %r13
xorq %r14, %r14
adcxq %rcx, %r10
# A[1] * B[1]
movq 8(%rbp), %rdx
mulxq 8(%rsi), %rdx, %rcx
adcxq %rax, %r11
adoxq %rdx, %r10
# A[3] * B[1]
movq 8(%rbp), %rdx
adoxq %rcx, %r11
mulxq 24(%rsi), %rcx, %rax
adcxq %rcx, %r12
# A[2] * B[2]
movq 16(%rbp), %rdx
mulxq 16(%rsi), %rdx, %rcx
adcxq %rax, %r13
adoxq %rdx, %r12
# A[3] * B[3]
movq 24(%rbp), %rdx
adoxq %rcx, %r13
mulxq 24(%rsi), %rcx, %rax
adoxq %r15, %r14
adcxq %rcx, %r14
# A[0] * B[3]
mulxq (%rsi), %rdx, %rcx
adcxq %rax, %r15
xorq %rax, %rax
adcxq %rdx, %r11
# A[3] * B[0]
movq (%rbp), %rdx
adcxq %rcx, %r12
mulxq 24(%rsi), %rdx, %rcx
adoxq %rdx, %r11
adoxq %rcx, %r12
# A[2] * B[3]
movq 24(%rbp), %rdx
mulxq 16(%rsi), %rdx, %rcx
adcxq %rdx, %r13
# A[3] * B[2]
movq 16(%rbp), %rdx
adcxq %rcx, %r14
mulxq 24(%rsi), %rcx, %rdx
adcxq %rax, %r15
adoxq %rcx, %r13
adoxq %rdx, %r14
adoxq %rax, %r15
# Reduce
movq $0x7fffffffffffffff, %rax
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rax, %r11
# Multiply top half by 19
movq $19, %rdx
xorq %rax, %rax
mulxq %r12, %rcx, %r12
adcxq %rcx, %r8
adoxq %r12, %r9
mulxq %r13, %rcx, %r13
adcxq %rcx, %r9
adoxq %r13, %r10
mulxq %r14, %rcx, %r14
adcxq %rcx, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rax, %rdx
adcxq %rax, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rax
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 8(%rsp), %rdi
movq (%rsp), %rsi
# Add
movq (%rbx), %r8
movq 8(%rbx), %r9
movq 16(%rbx), %r10
movq 24(%rbx), %rdx
movq %r8, %r12
addq (%rdi), %r8
movq %r9, %r13
adcq 8(%rdi), %r9
movq %r10, %r14
adcq 16(%rdi), %r10
movq %rdx, %r15
adcq 24(%rdi), %rdx
movq $-19, %rcx
movq %rdx, %r11
movq $0x7fffffffffffffff, %rax
sarq $63, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Sub modulus (if overflow)
subq %rcx, %r8
sbbq %rdx, %r9
sbbq %rdx, %r10
sbbq %rax, %r11
# Sub
subq (%rdi), %r12
movq $0x00, %rdx
sbbq 8(%rdi), %r13
movq $-19, %rcx
sbbq 16(%rdi), %r14
movq $0x7fffffffffffffff, %rax
sbbq 24(%rdi), %r15
sbbq $0x00, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Add modulus (if underflow)
addq %rcx, %r12
adcq %rdx, %r13
adcq %rdx, %r14
adcq %rax, %r15
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq %r12, (%rsi)
movq %r13, 8(%rsi)
movq %r14, 16(%rsi)
movq %r15, 24(%rsi)
movq 104(%rsp), %rdi
# Double
movq (%rdi), %r8
movq 8(%rdi), %r9
addq %r8, %r8
movq 16(%rdi), %r10
adcq %r9, %r9
movq 24(%rdi), %rdx
adcq %r10, %r10
movq $-19, %rcx
adcq %rdx, %rdx
movq $0x7fffffffffffffff, %rax
movq %rdx, %r11
sarq $63, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Sub modulus (if overflow)
subq %rcx, %r8
sbbq %rdx, %r9
sbbq %rdx, %r10
sbbq %rax, %r11
movq %r8, (%rbx)
movq %r9, 8(%rbx)
movq %r10, 16(%rbx)
movq %r11, 24(%rbx)
movq 24(%rsp), %rdi
# Add
movq (%rbx), %r8
movq 8(%rbx), %r9
movq 16(%rbx), %r10
movq 24(%rbx), %rdx
movq %r8, %r12
addq (%rdi), %r8
movq %r9, %r13
adcq 8(%rdi), %r9
movq %r10, %r14
adcq 16(%rdi), %r10
movq %rdx, %r15
adcq 24(%rdi), %rdx
movq $-19, %rcx
movq %rdx, %r11
movq $0x7fffffffffffffff, %rax
sarq $63, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Sub modulus (if overflow)
subq %rcx, %r8
sbbq %rdx, %r9
sbbq %rdx, %r10
sbbq %rax, %r11
# Sub
subq (%rdi), %r12
movq $0x00, %rdx
sbbq 8(%rdi), %r13
movq $-19, %rcx
sbbq 16(%rdi), %r14
movq $0x7fffffffffffffff, %rax
sbbq 24(%rdi), %r15
sbbq $0x00, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Add modulus (if underflow)
addq %rcx, %r12
adcq %rdx, %r13
adcq %rdx, %r14
adcq %rax, %r15
movq %r8, (%rbx)
movq %r9, 8(%rbx)
movq %r10, 16(%rbx)
movq %r11, 24(%rbx)
movq %r12, (%rdi)
movq %r13, 8(%rdi)
movq %r14, 16(%rdi)
movq %r15, 24(%rdi)
addq $48, %rsp
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbx
popq %rbp
repz retq
#ifndef __APPLE__
.size fe_ge_madd_avx2,.-fe_ge_madd_avx2
#endif /* __APPLE__ */
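# fe_ge_msub_avx2: identical structure to fe_ge_madd_avx2 above, but for the
# mixed point subtraction: the two table operands at 128(%rsp)/136(%rsp) are
# consumed in swapped order and the final sum/difference pair is stored to
# swapped destinations.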
#ifndef __APPLE__
.text
.globl fe_ge_msub_avx2
.type fe_ge_msub_avx2,@function
.align 4
fe_ge_msub_avx2:
#else
.section __TEXT,__text
.globl _fe_ge_msub_avx2
.p2align 2
_fe_ge_msub_avx2:
#endif /* __APPLE__ */
pushq %rbp
pushq %rbx
pushq %r12
pushq %r13
pushq %r14
pushq %r15
subq $48, %rsp
movq %rdi, (%rsp)
movq %rsi, 8(%rsp)
movq %rdx, 16(%rsp)
movq %rcx, 24(%rsp)
movq %r8, 32(%rsp)
movq %r9, 40(%rsp)
movq 8(%rsp), %rsi
movq 40(%rsp), %rbx
movq 32(%rsp), %rbp
# Add
movq (%rbx), %r8
movq 8(%rbx), %r9
movq 16(%rbx), %r10
movq 24(%rbx), %rdx
movq %r8, %r12
addq (%rbp), %r8
movq %r9, %r13
adcq 8(%rbp), %r9
movq %r10, %r14
adcq 16(%rbp), %r10
movq %rdx, %r15
adcq 24(%rbp), %rdx
movq $-19, %rcx
movq %rdx, %r11
movq $0x7fffffffffffffff, %rax
sarq $63, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Sub modulus (if overflow)
subq %rcx, %r8
sbbq %rdx, %r9
sbbq %rdx, %r10
sbbq %rax, %r11
# Sub
subq (%rbp), %r12
movq $0x00, %rdx
sbbq 8(%rbp), %r13
movq $-19, %rcx
sbbq 16(%rbp), %r14
movq $0x7fffffffffffffff, %rax
sbbq 24(%rbp), %r15
sbbq $0x00, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Add modulus (if underflow)
addq %rcx, %r12
adcq %rdx, %r13
adcq %rdx, %r14
adcq %rax, %r15
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq %r12, (%rsi)
movq %r13, 8(%rsi)
movq %r14, 16(%rsi)
movq %r15, 24(%rsi)
movq 16(%rsp), %rbx
movq 136(%rsp), %rbp
# Multiply
# A[0] * B[0]
movq (%rbp), %rdx
mulxq (%rdi), %r8, %r9
# A[2] * B[0]
mulxq 16(%rdi), %r10, %r11
# A[1] * B[0]
mulxq 8(%rdi), %rcx, %rax
xorq %r15, %r15
adcxq %rcx, %r9
# A[1] * B[3]
movq 24(%rbp), %rdx
mulxq 8(%rdi), %r12, %r13
adcxq %rax, %r10
# A[0] * B[1]
movq 8(%rbp), %rdx
mulxq (%rdi), %rcx, %rax
adoxq %rcx, %r9
# A[2] * B[1]
mulxq 16(%rdi), %rcx, %r14
adoxq %rax, %r10
adcxq %rcx, %r11
# A[1] * B[2]
movq 16(%rbp), %rdx
mulxq 8(%rdi), %rcx, %rax
adcxq %r14, %r12
adoxq %rcx, %r11
adcxq %r15, %r13
adoxq %rax, %r12
# A[0] * B[2]
mulxq (%rdi), %rcx, %rax
adoxq %r15, %r13
xorq %r14, %r14
adcxq %rcx, %r10
# A[1] * B[1]
movq 8(%rbp), %rdx
mulxq 8(%rdi), %rdx, %rcx
adcxq %rax, %r11
adoxq %rdx, %r10
# A[3] * B[1]
movq 8(%rbp), %rdx
adoxq %rcx, %r11
mulxq 24(%rdi), %rcx, %rax
adcxq %rcx, %r12
# A[2] * B[2]
movq 16(%rbp), %rdx
mulxq 16(%rdi), %rdx, %rcx
adcxq %rax, %r13
adoxq %rdx, %r12
# A[3] * B[3]
movq 24(%rbp), %rdx
adoxq %rcx, %r13
mulxq 24(%rdi), %rcx, %rax
adoxq %r15, %r14
adcxq %rcx, %r14
# A[0] * B[3]
mulxq (%rdi), %rdx, %rcx
adcxq %rax, %r15
xorq %rax, %rax
adcxq %rdx, %r11
# A[3] * B[0]
movq (%rbp), %rdx
adcxq %rcx, %r12
mulxq 24(%rdi), %rdx, %rcx
adoxq %rdx, %r11
adoxq %rcx, %r12
# A[2] * B[3]
movq 24(%rbp), %rdx
mulxq 16(%rdi), %rdx, %rcx
adcxq %rdx, %r13
# A[3] * B[2]
movq 16(%rbp), %rdx
adcxq %rcx, %r14
mulxq 24(%rdi), %rcx, %rdx
adcxq %rax, %r15
adoxq %rcx, %r13
adoxq %rdx, %r14
adoxq %rax, %r15
# Reduce
movq $0x7fffffffffffffff, %rax
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rax, %r11
# Multiply top half by 19
movq $19, %rdx
xorq %rax, %rax
mulxq %r12, %rcx, %r12
adcxq %rcx, %r8
adoxq %r12, %r9
mulxq %r13, %rcx, %r13
adcxq %rcx, %r9
adoxq %r13, %r10
mulxq %r14, %rcx, %r14
adcxq %rcx, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rax, %rdx
adcxq %rax, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rax
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rbx)
movq %r9, 8(%rbx)
movq %r10, 16(%rbx)
movq %r11, 24(%rbx)
movq 128(%rsp), %rdi
# Multiply
# A[0] * B[0]
movq (%rdi), %rdx
mulxq (%rsi), %r8, %r9
# A[2] * B[0]
mulxq 16(%rsi), %r10, %r11
# A[1] * B[0]
mulxq 8(%rsi), %rcx, %rax
xorq %r15, %r15
adcxq %rcx, %r9
# A[1] * B[3]
movq 24(%rdi), %rdx
mulxq 8(%rsi), %r12, %r13
adcxq %rax, %r10
# A[0] * B[1]
movq 8(%rdi), %rdx
mulxq (%rsi), %rcx, %rax
adoxq %rcx, %r9
# A[2] * B[1]
mulxq 16(%rsi), %rcx, %r14
adoxq %rax, %r10
adcxq %rcx, %r11
# A[1] * B[2]
movq 16(%rdi), %rdx
mulxq 8(%rsi), %rcx, %rax
adcxq %r14, %r12
adoxq %rcx, %r11
adcxq %r15, %r13
adoxq %rax, %r12
# A[0] * B[2]
mulxq (%rsi), %rcx, %rax
adoxq %r15, %r13
xorq %r14, %r14
adcxq %rcx, %r10
# A[1] * B[1]
movq 8(%rdi), %rdx
mulxq 8(%rsi), %rdx, %rcx
adcxq %rax, %r11
adoxq %rdx, %r10
# A[3] * B[1]
movq 8(%rdi), %rdx
adoxq %rcx, %r11
mulxq 24(%rsi), %rcx, %rax
adcxq %rcx, %r12
# A[2] * B[2]
movq 16(%rdi), %rdx
mulxq 16(%rsi), %rdx, %rcx
adcxq %rax, %r13
adoxq %rdx, %r12
# A[3] * B[3]
movq 24(%rdi), %rdx
adoxq %rcx, %r13
mulxq 24(%rsi), %rcx, %rax
adoxq %r15, %r14
adcxq %rcx, %r14
# A[0] * B[3]
mulxq (%rsi), %rdx, %rcx
adcxq %rax, %r15
xorq %rax, %rax
adcxq %rdx, %r11
# A[3] * B[0]
movq (%rdi), %rdx
adcxq %rcx, %r12
mulxq 24(%rsi), %rdx, %rcx
adoxq %rdx, %r11
adoxq %rcx, %r12
# A[2] * B[3]
movq 24(%rdi), %rdx
mulxq 16(%rsi), %rdx, %rcx
adcxq %rdx, %r13
# A[3] * B[2]
movq 16(%rdi), %rdx
adcxq %rcx, %r14
mulxq 24(%rsi), %rcx, %rdx
adcxq %rax, %r15
adoxq %rcx, %r13
adoxq %rdx, %r14
adoxq %rax, %r15
# Reduce
movq $0x7fffffffffffffff, %rax
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rax, %r11
# Multiply top half by 19
movq $19, %rdx
xorq %rax, %rax
mulxq %r12, %rcx, %r12
adcxq %rcx, %r8
adoxq %r12, %r9
mulxq %r13, %rcx, %r13
adcxq %rcx, %r9
adoxq %r13, %r10
mulxq %r14, %rcx, %r14
adcxq %rcx, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rax, %rdx
adcxq %rax, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rax
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rsi)
movq %r9, 8(%rsi)
movq %r10, 16(%rsi)
movq %r11, 24(%rsi)
movq 24(%rsp), %rdi
movq 120(%rsp), %rsi
movq 112(%rsp), %rbp
# Multiply
# A[0] * B[0]
movq (%rbp), %rdx
mulxq (%rsi), %r8, %r9
# A[2] * B[0]
mulxq 16(%rsi), %r10, %r11
# A[1] * B[0]
mulxq 8(%rsi), %rcx, %rax
xorq %r15, %r15
adcxq %rcx, %r9
# A[1] * B[3]
movq 24(%rbp), %rdx
mulxq 8(%rsi), %r12, %r13
adcxq %rax, %r10
# A[0] * B[1]
movq 8(%rbp), %rdx
mulxq (%rsi), %rcx, %rax
adoxq %rcx, %r9
# A[2] * B[1]
mulxq 16(%rsi), %rcx, %r14
adoxq %rax, %r10
adcxq %rcx, %r11
# A[1] * B[2]
movq 16(%rbp), %rdx
mulxq 8(%rsi), %rcx, %rax
adcxq %r14, %r12
adoxq %rcx, %r11
adcxq %r15, %r13
adoxq %rax, %r12
# A[0] * B[2]
mulxq (%rsi), %rcx, %rax
adoxq %r15, %r13
xorq %r14, %r14
adcxq %rcx, %r10
# A[1] * B[1]
movq 8(%rbp), %rdx
mulxq 8(%rsi), %rdx, %rcx
adcxq %rax, %r11
adoxq %rdx, %r10
# A[3] * B[1]
movq 8(%rbp), %rdx
adoxq %rcx, %r11
mulxq 24(%rsi), %rcx, %rax
adcxq %rcx, %r12
# A[2] * B[2]
movq 16(%rbp), %rdx
mulxq 16(%rsi), %rdx, %rcx
adcxq %rax, %r13
adoxq %rdx, %r12
# A[3] * B[3]
movq 24(%rbp), %rdx
adoxq %rcx, %r13
mulxq 24(%rsi), %rcx, %rax
adoxq %r15, %r14
adcxq %rcx, %r14
# A[0] * B[3]
mulxq (%rsi), %rdx, %rcx
adcxq %rax, %r15
xorq %rax, %rax
adcxq %rdx, %r11
# A[3] * B[0]
movq (%rbp), %rdx
adcxq %rcx, %r12
mulxq 24(%rsi), %rdx, %rcx
adoxq %rdx, %r11
adoxq %rcx, %r12
# A[2] * B[3]
movq 24(%rbp), %rdx
mulxq 16(%rsi), %rdx, %rcx
adcxq %rdx, %r13
# A[3] * B[2]
movq 16(%rbp), %rdx
adcxq %rcx, %r14
mulxq 24(%rsi), %rcx, %rdx
adcxq %rax, %r15
adoxq %rcx, %r13
adoxq %rdx, %r14
adoxq %rax, %r15
# Reduce
movq $0x7fffffffffffffff, %rax
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rax, %r11
# Multiply top half by 19
movq $19, %rdx
xorq %rax, %rax
mulxq %r12, %rcx, %r12
adcxq %rcx, %r8
adoxq %r12, %r9
mulxq %r13, %rcx, %r13
adcxq %rcx, %r9
adoxq %r13, %r10
mulxq %r14, %rcx, %r14
adcxq %rcx, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rax, %rdx
adcxq %rax, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rax
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 8(%rsp), %rsi
movq (%rsp), %rbp
# Add
movq (%rbx), %r8
movq 8(%rbx), %r9
movq 16(%rbx), %r10
movq 24(%rbx), %rdx
movq %r8, %r12
addq (%rsi), %r8
movq %r9, %r13
adcq 8(%rsi), %r9
movq %r10, %r14
adcq 16(%rsi), %r10
movq %rdx, %r15
adcq 24(%rsi), %rdx
movq $-19, %rcx
movq %rdx, %r11
movq $0x7fffffffffffffff, %rax
sarq $63, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Sub modulus (if overflow)
subq %rcx, %r8
sbbq %rdx, %r9
sbbq %rdx, %r10
sbbq %rax, %r11
# Sub
subq (%rsi), %r12
movq $0x00, %rdx
sbbq 8(%rsi), %r13
movq $-19, %rcx
sbbq 16(%rsi), %r14
movq $0x7fffffffffffffff, %rax
sbbq 24(%rsi), %r15
sbbq $0x00, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Add modulus (if underflow)
addq %rcx, %r12
adcq %rdx, %r13
adcq %rdx, %r14
adcq %rax, %r15
movq %r8, (%rsi)
movq %r9, 8(%rsi)
movq %r10, 16(%rsi)
movq %r11, 24(%rsi)
movq %r12, (%rbp)
movq %r13, 8(%rbp)
movq %r14, 16(%rbp)
movq %r15, 24(%rbp)
movq 104(%rsp), %rsi
# Double
movq (%rsi), %r8
movq 8(%rsi), %r9
addq %r8, %r8
movq 16(%rsi), %r10
adcq %r9, %r9
movq 24(%rsi), %rdx
adcq %r10, %r10
movq $-19, %rcx
adcq %rdx, %rdx
movq $0x7fffffffffffffff, %rax
movq %rdx, %r11
sarq $63, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Sub modulus (if overflow)
subq %rcx, %r8
sbbq %rdx, %r9
sbbq %rdx, %r10
sbbq %rax, %r11
movq %r8, (%rbx)
movq %r9, 8(%rbx)
movq %r10, 16(%rbx)
movq %r11, 24(%rbx)
# Add
movq (%rbx), %r8
movq 8(%rbx), %r9
movq 16(%rbx), %r10
movq 24(%rbx), %rdx
movq %r8, %r12
addq (%rdi), %r8
movq %r9, %r13
adcq 8(%rdi), %r9
movq %r10, %r14
adcq 16(%rdi), %r10
movq %rdx, %r15
adcq 24(%rdi), %rdx
movq $-19, %rcx
movq %rdx, %r11
movq $0x7fffffffffffffff, %rax
sarq $63, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Sub modulus (if overflow)
subq %rcx, %r8
sbbq %rdx, %r9
sbbq %rdx, %r10
sbbq %rax, %r11
# Sub
subq (%rdi), %r12
movq $0x00, %rdx
sbbq 8(%rdi), %r13
movq $-19, %rcx
sbbq 16(%rdi), %r14
movq $0x7fffffffffffffff, %rax
sbbq 24(%rdi), %r15
sbbq $0x00, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Add modulus (if underflow)
addq %rcx, %r12
adcq %rdx, %r13
adcq %rdx, %r14
adcq %rax, %r15
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq %r12, (%rbx)
movq %r13, 8(%rbx)
movq %r14, 16(%rbx)
movq %r15, 24(%rbx)
addq $48, %rsp
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbx
popq %rbp
repz retq
#ifndef __APPLE__
.size fe_ge_msub_avx2,.-fe_ge_msub_avx2
#endif /* __APPLE__ */
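# fe_ge_add_avx2: field arithmetic core of a full point addition (presumably
# projective point plus cached point). The 0x50-byte frame leaves a spare
# field-element temporary at 48(%rsp), and the caller's six stack arguments
# start at 136(%rsp).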
#ifndef __APPLE__
.text
.globl fe_ge_add_avx2
.type fe_ge_add_avx2,@function
.align 4
fe_ge_add_avx2:
#else
.section __TEXT,__text
.globl _fe_ge_add_avx2
.p2align 2
_fe_ge_add_avx2:
#endif /* __APPLE__ */
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15
subq $0x50, %rsp
movq %rdi, (%rsp)
movq %rsi, 8(%rsp)
movq %rdx, 16(%rsp)
movq %rcx, 24(%rsp)
movq %r8, 32(%rsp)
movq %r9, 40(%rsp)
movq 8(%rsp), %rsi
movq 40(%rsp), %rbx
movq 32(%rsp), %rbp
# Add
movq (%rbx), %r8
movq 8(%rbx), %r9
movq 16(%rbx), %r10
movq 24(%rbx), %rdx
movq %r8, %r12
addq (%rbp), %r8
movq %r9, %r13
adcq 8(%rbp), %r9
movq %r10, %r14
adcq 16(%rbp), %r10
movq %rdx, %r15
adcq 24(%rbp), %rdx
movq $-19, %rcx
movq %rdx, %r11
movq $0x7fffffffffffffff, %rax
sarq $63, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Sub modulus (if overflow)
subq %rcx, %r8
sbbq %rdx, %r9
sbbq %rdx, %r10
sbbq %rax, %r11
# Sub
subq (%rbp), %r12
movq $0x00, %rdx
sbbq 8(%rbp), %r13
movq $-19, %rcx
sbbq 16(%rbp), %r14
movq $0x7fffffffffffffff, %rax
sbbq 24(%rbp), %r15
sbbq $0x00, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Add modulus (if underflow)
addq %rcx, %r12
adcq %rdx, %r13
adcq %rdx, %r14
adcq %rax, %r15
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq %r12, (%rsi)
movq %r13, 8(%rsi)
movq %r14, 16(%rsi)
movq %r15, 24(%rsi)
movq 16(%rsp), %rbx
movq 168(%rsp), %rbp
# Multiply
# A[0] * B[0]
movq (%rbp), %rdx
mulxq (%rdi), %r8, %r9
# A[2] * B[0]
mulxq 16(%rdi), %r10, %r11
# A[1] * B[0]
mulxq 8(%rdi), %rcx, %rax
xorq %r15, %r15
adcxq %rcx, %r9
# A[1] * B[3]
movq 24(%rbp), %rdx
mulxq 8(%rdi), %r12, %r13
adcxq %rax, %r10
# A[0] * B[1]
movq 8(%rbp), %rdx
mulxq (%rdi), %rcx, %rax
adoxq %rcx, %r9
# A[2] * B[1]
mulxq 16(%rdi), %rcx, %r14
adoxq %rax, %r10
adcxq %rcx, %r11
# A[1] * B[2]
movq 16(%rbp), %rdx
mulxq 8(%rdi), %rcx, %rax
adcxq %r14, %r12
adoxq %rcx, %r11
adcxq %r15, %r13
adoxq %rax, %r12
# A[0] * B[2]
mulxq (%rdi), %rcx, %rax
adoxq %r15, %r13
xorq %r14, %r14
adcxq %rcx, %r10
# A[1] * B[1]
movq 8(%rbp), %rdx
mulxq 8(%rdi), %rdx, %rcx
adcxq %rax, %r11
adoxq %rdx, %r10
# A[3] * B[1]
movq 8(%rbp), %rdx
adoxq %rcx, %r11
mulxq 24(%rdi), %rcx, %rax
adcxq %rcx, %r12
# A[2] * B[2]
movq 16(%rbp), %rdx
mulxq 16(%rdi), %rdx, %rcx
adcxq %rax, %r13
adoxq %rdx, %r12
# A[3] * B[3]
movq 24(%rbp), %rdx
adoxq %rcx, %r13
mulxq 24(%rdi), %rcx, %rax
adoxq %r15, %r14
adcxq %rcx, %r14
# A[0] * B[3]
mulxq (%rdi), %rdx, %rcx
adcxq %rax, %r15
xorq %rax, %rax
adcxq %rdx, %r11
# A[3] * B[0]
movq (%rbp), %rdx
adcxq %rcx, %r12
mulxq 24(%rdi), %rdx, %rcx
adoxq %rdx, %r11
adoxq %rcx, %r12
# A[2] * B[3]
movq 24(%rbp), %rdx
mulxq 16(%rdi), %rdx, %rcx
adcxq %rdx, %r13
# A[3] * B[2]
movq 16(%rbp), %rdx
adcxq %rcx, %r14
mulxq 24(%rdi), %rcx, %rdx
adcxq %rax, %r15
adoxq %rcx, %r13
adoxq %rdx, %r14
adoxq %rax, %r15
# Reduce
movq $0x7fffffffffffffff, %rax
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rax, %r11
# Multiply top half by 19
movq $19, %rdx
xorq %rax, %rax
mulxq %r12, %rcx, %r12
adcxq %rcx, %r8
adoxq %r12, %r9
mulxq %r13, %rcx, %r13
adcxq %rcx, %r9
adoxq %r13, %r10
mulxq %r14, %rcx, %r14
adcxq %rcx, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rax, %rdx
adcxq %rax, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rax
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rbx)
movq %r9, 8(%rbx)
movq %r10, 16(%rbx)
movq %r11, 24(%rbx)
movq 176(%rsp), %rbx
# Multiply
# A[0] * B[0]
movq (%rbx), %rdx
mulxq (%rsi), %r8, %r9
# A[2] * B[0]
mulxq 16(%rsi), %r10, %r11
# A[1] * B[0]
mulxq 8(%rsi), %rcx, %rax
xorq %r15, %r15
adcxq %rcx, %r9
# A[1] * B[3]
movq 24(%rbx), %rdx
mulxq 8(%rsi), %r12, %r13
adcxq %rax, %r10
# A[0] * B[1]
movq 8(%rbx), %rdx
mulxq (%rsi), %rcx, %rax
adoxq %rcx, %r9
# A[2] * B[1]
mulxq 16(%rsi), %rcx, %r14
adoxq %rax, %r10
adcxq %rcx, %r11
# A[1] * B[2]
movq 16(%rbx), %rdx
mulxq 8(%rsi), %rcx, %rax
adcxq %r14, %r12
adoxq %rcx, %r11
adcxq %r15, %r13
adoxq %rax, %r12
# A[0] * B[2]
mulxq (%rsi), %rcx, %rax
adoxq %r15, %r13
xorq %r14, %r14
adcxq %rcx, %r10
# A[1] * B[1]
movq 8(%rbx), %rdx
mulxq 8(%rsi), %rdx, %rcx
adcxq %rax, %r11
adoxq %rdx, %r10
# A[3] * B[1]
movq 8(%rbx), %rdx
adoxq %rcx, %r11
mulxq 24(%rsi), %rcx, %rax
adcxq %rcx, %r12
# A[2] * B[2]
movq 16(%rbx), %rdx
mulxq 16(%rsi), %rdx, %rcx
adcxq %rax, %r13
adoxq %rdx, %r12
# A[3] * B[3]
movq 24(%rbx), %rdx
adoxq %rcx, %r13
mulxq 24(%rsi), %rcx, %rax
adoxq %r15, %r14
adcxq %rcx, %r14
# A[0] * B[3]
mulxq (%rsi), %rdx, %rcx
adcxq %rax, %r15
xorq %rax, %rax
adcxq %rdx, %r11
# A[3] * B[0]
movq (%rbx), %rdx
adcxq %rcx, %r12
mulxq 24(%rsi), %rdx, %rcx
adoxq %rdx, %r11
adoxq %rcx, %r12
# A[2] * B[3]
movq 24(%rbx), %rdx
mulxq 16(%rsi), %rdx, %rcx
adcxq %rdx, %r13
# A[3] * B[2]
movq 16(%rbx), %rdx
adcxq %rcx, %r14
mulxq 24(%rsi), %rcx, %rdx
adcxq %rax, %r15
adoxq %rcx, %r13
adoxq %rdx, %r14
adoxq %rax, %r15
# Reduce
movq $0x7fffffffffffffff, %rax
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rax, %r11
# Multiply top half by 19
movq $19, %rdx
xorq %rax, %rax
mulxq %r12, %rcx, %r12
adcxq %rcx, %r8
adoxq %r12, %r9
mulxq %r13, %rcx, %r13
adcxq %rcx, %r9
adoxq %r13, %r10
mulxq %r14, %rcx, %r14
adcxq %rcx, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rax, %rdx
adcxq %rax, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rax
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rsi)
movq %r9, 8(%rsi)
movq %r10, 16(%rsi)
movq %r11, 24(%rsi)
movq 24(%rsp), %rsi
movq 160(%rsp), %rbx
movq 144(%rsp), %rbp
# Multiply
# A[0] * B[0]
movq (%rbp), %rdx
mulxq (%rbx), %r8, %r9
# A[2] * B[0]
mulxq 16(%rbx), %r10, %r11
# A[1] * B[0]
mulxq 8(%rbx), %rcx, %rax
xorq %r15, %r15
adcxq %rcx, %r9
# A[1] * B[3]
movq 24(%rbp), %rdx
mulxq 8(%rbx), %r12, %r13
adcxq %rax, %r10
# A[0] * B[1]
movq 8(%rbp), %rdx
mulxq (%rbx), %rcx, %rax
adoxq %rcx, %r9
# A[2] * B[1]
mulxq 16(%rbx), %rcx, %r14
adoxq %rax, %r10
adcxq %rcx, %r11
# A[1] * B[2]
movq 16(%rbp), %rdx
mulxq 8(%rbx), %rcx, %rax
adcxq %r14, %r12
adoxq %rcx, %r11
adcxq %r15, %r13
adoxq %rax, %r12
# A[0] * B[2]
mulxq (%rbx), %rcx, %rax
adoxq %r15, %r13
xorq %r14, %r14
adcxq %rcx, %r10
# A[1] * B[1]
movq 8(%rbp), %rdx
mulxq 8(%rbx), %rdx, %rcx
adcxq %rax, %r11
adoxq %rdx, %r10
# A[3] * B[1]
movq 8(%rbp), %rdx
adoxq %rcx, %r11
mulxq 24(%rbx), %rcx, %rax
adcxq %rcx, %r12
# A[2] * B[2]
movq 16(%rbp), %rdx
mulxq 16(%rbx), %rdx, %rcx
adcxq %rax, %r13
adoxq %rdx, %r12
# A[3] * B[3]
movq 24(%rbp), %rdx
adoxq %rcx, %r13
mulxq 24(%rbx), %rcx, %rax
adoxq %r15, %r14
adcxq %rcx, %r14
# A[0] * B[3]
mulxq (%rbx), %rdx, %rcx
adcxq %rax, %r15
xorq %rax, %rax
adcxq %rdx, %r11
# A[3] * B[0]
movq (%rbp), %rdx
adcxq %rcx, %r12
mulxq 24(%rbx), %rdx, %rcx
adoxq %rdx, %r11
adoxq %rcx, %r12
# A[2] * B[3]
movq 24(%rbp), %rdx
mulxq 16(%rbx), %rdx, %rcx
adcxq %rdx, %r13
# A[3] * B[2]
movq 16(%rbp), %rdx
adcxq %rcx, %r14
mulxq 24(%rbx), %rcx, %rdx
adcxq %rax, %r15
adoxq %rcx, %r13
adoxq %rdx, %r14
adoxq %rax, %r15
# Reduce
movq $0x7fffffffffffffff, %rax
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rax, %r11
# Multiply top half by 19
movq $19, %rdx
xorq %rax, %rax
mulxq %r12, %rcx, %r12
adcxq %rcx, %r8
adoxq %r12, %r9
mulxq %r13, %rcx, %r13
adcxq %rcx, %r9
adoxq %r13, %r10
mulxq %r14, %rcx, %r14
adcxq %rcx, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rax, %rdx
adcxq %rax, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rax
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rsi)
movq %r9, 8(%rsi)
movq %r10, 16(%rsi)
movq %r11, 24(%rsi)
movq 136(%rsp), %rsi
movq 152(%rsp), %rbx
# Multiply
# A[0] * B[0]
movq (%rbx), %rdx
mulxq (%rsi), %r8, %r9
# A[2] * B[0]
mulxq 16(%rsi), %r10, %r11
# A[1] * B[0]
mulxq 8(%rsi), %rcx, %rax
xorq %r15, %r15
adcxq %rcx, %r9
# A[1] * B[3]
movq 24(%rbx), %rdx
mulxq 8(%rsi), %r12, %r13
adcxq %rax, %r10
# A[0] * B[1]
movq 8(%rbx), %rdx
mulxq (%rsi), %rcx, %rax
adoxq %rcx, %r9
# A[2] * B[1]
mulxq 16(%rsi), %rcx, %r14
adoxq %rax, %r10
adcxq %rcx, %r11
# A[1] * B[2]
movq 16(%rbx), %rdx
mulxq 8(%rsi), %rcx, %rax
adcxq %r14, %r12
adoxq %rcx, %r11
adcxq %r15, %r13
adoxq %rax, %r12
# A[0] * B[2]
mulxq (%rsi), %rcx, %rax
adoxq %r15, %r13
xorq %r14, %r14
adcxq %rcx, %r10
# A[1] * B[1]
movq 8(%rbx), %rdx
mulxq 8(%rsi), %rdx, %rcx
adcxq %rax, %r11
adoxq %rdx, %r10
# A[3] * B[1]
movq 8(%rbx), %rdx
adoxq %rcx, %r11
mulxq 24(%rsi), %rcx, %rax
adcxq %rcx, %r12
# A[2] * B[2]
movq 16(%rbx), %rdx
mulxq 16(%rsi), %rdx, %rcx
adcxq %rax, %r13
adoxq %rdx, %r12
# A[3] * B[3]
movq 24(%rbx), %rdx
adoxq %rcx, %r13
mulxq 24(%rsi), %rcx, %rax
adoxq %r15, %r14
adcxq %rcx, %r14
# A[0] * B[3]
mulxq (%rsi), %rdx, %rcx
adcxq %rax, %r15
xorq %rax, %rax
adcxq %rdx, %r11
# A[3] * B[0]
movq (%rbx), %rdx
adcxq %rcx, %r12
mulxq 24(%rsi), %rdx, %rcx
adoxq %rdx, %r11
adoxq %rcx, %r12
# A[2] * B[3]
movq 24(%rbx), %rdx
mulxq 16(%rsi), %rdx, %rcx
adcxq %rdx, %r13
# A[3] * B[2]
movq 16(%rbx), %rdx
adcxq %rcx, %r14
mulxq 24(%rsi), %rcx, %rdx
adcxq %rax, %r15
adoxq %rcx, %r13
adoxq %rdx, %r14
adoxq %rax, %r15
# Reduce
movq $0x7fffffffffffffff, %rax
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rax, %r11
# Multiply top half by 19
movq $19, %rdx
xorq %rax, %rax
mulxq %r12, %rcx, %r12
adcxq %rcx, %r8
adoxq %r12, %r9
mulxq %r13, %rcx, %r13
adcxq %rcx, %r9
adoxq %r13, %r10
mulxq %r14, %rcx, %r14
adcxq %rcx, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rax, %rdx
adcxq %rax, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rax
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
leaq 48(%rsp), %rsi
# Double
movq (%rdi), %r8
movq 8(%rdi), %r9
addq %r8, %r8
movq 16(%rdi), %r10
adcq %r9, %r9
movq 24(%rdi), %rdx
adcq %r10, %r10
movq $-19, %rcx
adcq %rdx, %rdx
movq $0x7fffffffffffffff, %rax
movq %rdx, %r11
sarq $63, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Sub modulus (if overflow)
subq %rcx, %r8
sbbq %rdx, %r9
sbbq %rdx, %r10
sbbq %rax, %r11
movq %r8, (%rsi)
movq %r9, 8(%rsi)
movq %r10, 16(%rsi)
movq %r11, 24(%rsi)
movq 8(%rsp), %rbx
movq 16(%rsp), %rbp
# Add
movq (%rbp), %r8
movq 8(%rbp), %r9
movq 16(%rbp), %r10
movq 24(%rbp), %rdx
movq %r8, %r12
addq (%rbx), %r8
movq %r9, %r13
adcq 8(%rbx), %r9
movq %r10, %r14
adcq 16(%rbx), %r10
movq %rdx, %r15
adcq 24(%rbx), %rdx
movq $-19, %rcx
movq %rdx, %r11
movq $0x7fffffffffffffff, %rax
sarq $63, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Sub modulus (if overflow)
subq %rcx, %r8
sbbq %rdx, %r9
sbbq %rdx, %r10
sbbq %rax, %r11
# Sub
subq (%rbx), %r12
movq $0x00, %rdx
sbbq 8(%rbx), %r13
movq $-19, %rcx
sbbq 16(%rbx), %r14
movq $0x7fffffffffffffff, %rax
sbbq 24(%rbx), %r15
sbbq $0x00, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Add modulus (if underflow)
addq %rcx, %r12
adcq %rdx, %r13
adcq %rdx, %r14
adcq %rax, %r15
movq %r8, (%rbx)
movq %r9, 8(%rbx)
movq %r10, 16(%rbx)
movq %r11, 24(%rbx)
movq %r12, (%rdi)
movq %r13, 8(%rdi)
movq %r14, 16(%rdi)
movq %r15, 24(%rdi)
movq 24(%rsp), %rdi
# Add
movq (%rsi), %r8
movq 8(%rsi), %r9
movq 16(%rsi), %r10
movq 24(%rsi), %rdx
movq %r8, %r12
addq (%rdi), %r8
movq %r9, %r13
adcq 8(%rdi), %r9
movq %r10, %r14
adcq 16(%rdi), %r10
movq %rdx, %r15
adcq 24(%rdi), %rdx
movq $-19, %rcx
movq %rdx, %r11
movq $0x7fffffffffffffff, %rax
sarq $63, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Sub modulus (if overflow)
subq %rcx, %r8
sbbq %rdx, %r9
sbbq %rdx, %r10
sbbq %rax, %r11
# Sub
subq (%rdi), %r12
movq $0x00, %rdx
sbbq 8(%rdi), %r13
movq $-19, %rcx
sbbq 16(%rdi), %r14
movq $0x7fffffffffffffff, %rax
sbbq 24(%rdi), %r15
sbbq $0x00, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Add modulus (if underflow)
addq %rcx, %r12
adcq %rdx, %r13
adcq %rdx, %r14
adcq %rax, %r15
movq %r8, (%rbp)
movq %r9, 8(%rbp)
movq %r10, 16(%rbp)
movq %r11, 24(%rbp)
movq %r12, (%rdi)
movq %r13, 8(%rdi)
movq %r14, 16(%rdi)
movq %r15, 24(%rdi)
addq $0x50, %rsp
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbp
popq %rbx
repz retq
#ifndef __APPLE__
.size fe_ge_add_avx2,.-fe_ge_add_avx2
#endif /* __APPLE__ */
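# fe_ge_sub_avx2: same layout as fe_ge_add_avx2 above, computing the full
# point subtraction; as with madd/msub, the multiplier operands at
# 168(%rsp)/176(%rsp) are used in swapped order.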
#ifndef __APPLE__
.text
.globl fe_ge_sub_avx2
.type fe_ge_sub_avx2,@function
.align 4
fe_ge_sub_avx2:
#else
.section __TEXT,__text
.globl _fe_ge_sub_avx2
.p2align 2
_fe_ge_sub_avx2:
#endif /* __APPLE__ */
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15
subq $0x50, %rsp
movq %rdi, (%rsp)
movq %rsi, 8(%rsp)
movq %rdx, 16(%rsp)
movq %rcx, 24(%rsp)
movq %r8, 32(%rsp)
movq %r9, 40(%rsp)
movq 8(%rsp), %rsi
movq 40(%rsp), %rbx
movq 32(%rsp), %rbp
# Add
movq (%rbx), %r8
movq 8(%rbx), %r9
movq 16(%rbx), %r10
movq 24(%rbx), %rdx
movq %r8, %r12
addq (%rbp), %r8
movq %r9, %r13
adcq 8(%rbp), %r9
movq %r10, %r14
adcq 16(%rbp), %r10
movq %rdx, %r15
adcq 24(%rbp), %rdx
movq $-19, %rcx
movq %rdx, %r11
movq $0x7fffffffffffffff, %rax
sarq $63, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Sub modulus (if overflow)
subq %rcx, %r8
sbbq %rdx, %r9
sbbq %rdx, %r10
sbbq %rax, %r11
# Sub
subq (%rbp), %r12
movq $0x00, %rdx
sbbq 8(%rbp), %r13
movq $-19, %rcx
sbbq 16(%rbp), %r14
movq $0x7fffffffffffffff, %rax
sbbq 24(%rbp), %r15
sbbq $0x00, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Add modulus (if underflow)
addq %rcx, %r12
adcq %rdx, %r13
adcq %rdx, %r14
adcq %rax, %r15
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq %r12, (%rsi)
movq %r13, 8(%rsi)
movq %r14, 16(%rsi)
movq %r15, 24(%rsi)
movq 16(%rsp), %rbx
movq 176(%rsp), %rbp
# Multiply
# A[0] * B[0]
movq (%rbp), %rdx
mulxq (%rdi), %r8, %r9
# A[2] * B[0]
mulxq 16(%rdi), %r10, %r11
# A[1] * B[0]
mulxq 8(%rdi), %rcx, %rax
xorq %r15, %r15
adcxq %rcx, %r9
# A[1] * B[3]
movq 24(%rbp), %rdx
mulxq 8(%rdi), %r12, %r13
adcxq %rax, %r10
# A[0] * B[1]
movq 8(%rbp), %rdx
mulxq (%rdi), %rcx, %rax
adoxq %rcx, %r9
# A[2] * B[1]
mulxq 16(%rdi), %rcx, %r14
adoxq %rax, %r10
adcxq %rcx, %r11
# A[1] * B[2]
movq 16(%rbp), %rdx
mulxq 8(%rdi), %rcx, %rax
adcxq %r14, %r12
adoxq %rcx, %r11
adcxq %r15, %r13
adoxq %rax, %r12
# A[0] * B[2]
mulxq (%rdi), %rcx, %rax
adoxq %r15, %r13
xorq %r14, %r14
adcxq %rcx, %r10
# A[1] * B[1]
movq 8(%rbp), %rdx
mulxq 8(%rdi), %rdx, %rcx
adcxq %rax, %r11
adoxq %rdx, %r10
# A[3] * B[1]
movq 8(%rbp), %rdx
adoxq %rcx, %r11
mulxq 24(%rdi), %rcx, %rax
adcxq %rcx, %r12
# A[2] * B[2]
movq 16(%rbp), %rdx
mulxq 16(%rdi), %rdx, %rcx
adcxq %rax, %r13
adoxq %rdx, %r12
# A[3] * B[3]
movq 24(%rbp), %rdx
adoxq %rcx, %r13
mulxq 24(%rdi), %rcx, %rax
adoxq %r15, %r14
adcxq %rcx, %r14
# A[0] * B[3]
mulxq (%rdi), %rdx, %rcx
adcxq %rax, %r15
xorq %rax, %rax
adcxq %rdx, %r11
# A[3] * B[0]
movq (%rbp), %rdx
adcxq %rcx, %r12
mulxq 24(%rdi), %rdx, %rcx
adoxq %rdx, %r11
adoxq %rcx, %r12
# A[2] * B[3]
movq 24(%rbp), %rdx
mulxq 16(%rdi), %rdx, %rcx
adcxq %rdx, %r13
# A[3] * B[2]
movq 16(%rbp), %rdx
adcxq %rcx, %r14
mulxq 24(%rdi), %rcx, %rdx
adcxq %rax, %r15
adoxq %rcx, %r13
adoxq %rdx, %r14
adoxq %rax, %r15
# Reduce
movq $0x7fffffffffffffff, %rax
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rax, %r11
# Multiply top half by 19
movq $19, %rdx
xorq %rax, %rax
mulxq %r12, %rcx, %r12
adcxq %rcx, %r8
adoxq %r12, %r9
mulxq %r13, %rcx, %r13
adcxq %rcx, %r9
adoxq %r13, %r10
mulxq %r14, %rcx, %r14
adcxq %rcx, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rax, %rdx
adcxq %rax, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rax
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rbx)
movq %r9, 8(%rbx)
movq %r10, 16(%rbx)
movq %r11, 24(%rbx)
movq 168(%rsp), %rbx
# Multiply
# A[0] * B[0]
movq (%rbx), %rdx
mulxq (%rsi), %r8, %r9
# A[2] * B[0]
mulxq 16(%rsi), %r10, %r11
# A[1] * B[0]
mulxq 8(%rsi), %rcx, %rax
xorq %r15, %r15
adcxq %rcx, %r9
# A[1] * B[3]
movq 24(%rbx), %rdx
mulxq 8(%rsi), %r12, %r13
adcxq %rax, %r10
# A[0] * B[1]
movq 8(%rbx), %rdx
mulxq (%rsi), %rcx, %rax
adoxq %rcx, %r9
# A[2] * B[1]
mulxq 16(%rsi), %rcx, %r14
adoxq %rax, %r10
adcxq %rcx, %r11
# A[1] * B[2]
movq 16(%rbx), %rdx
mulxq 8(%rsi), %rcx, %rax
adcxq %r14, %r12
adoxq %rcx, %r11
adcxq %r15, %r13
adoxq %rax, %r12
# A[0] * B[2]
mulxq (%rsi), %rcx, %rax
adoxq %r15, %r13
xorq %r14, %r14
adcxq %rcx, %r10
# A[1] * B[1]
movq 8(%rbx), %rdx
mulxq 8(%rsi), %rdx, %rcx
adcxq %rax, %r11
adoxq %rdx, %r10
# A[3] * B[1]
movq 8(%rbx), %rdx
adoxq %rcx, %r11
mulxq 24(%rsi), %rcx, %rax
adcxq %rcx, %r12
# A[2] * B[2]
movq 16(%rbx), %rdx
mulxq 16(%rsi), %rdx, %rcx
adcxq %rax, %r13
adoxq %rdx, %r12
# A[3] * B[3]
movq 24(%rbx), %rdx
adoxq %rcx, %r13
mulxq 24(%rsi), %rcx, %rax
adoxq %r15, %r14
adcxq %rcx, %r14
# A[0] * B[3]
mulxq (%rsi), %rdx, %rcx
adcxq %rax, %r15
xorq %rax, %rax
adcxq %rdx, %r11
# A[3] * B[0]
movq (%rbx), %rdx
adcxq %rcx, %r12
mulxq 24(%rsi), %rdx, %rcx
adoxq %rdx, %r11
adoxq %rcx, %r12
# A[2] * B[3]
movq 24(%rbx), %rdx
mulxq 16(%rsi), %rdx, %rcx
adcxq %rdx, %r13
# A[3] * B[2]
movq 16(%rbx), %rdx
adcxq %rcx, %r14
mulxq 24(%rsi), %rcx, %rdx
adcxq %rax, %r15
adoxq %rcx, %r13
adoxq %rdx, %r14
adoxq %rax, %r15
# Reduce
movq $0x7fffffffffffffff, %rax
        # Move top half into r12-r15 and remove top bit (bit 255) from r11
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rax, %r11
# Multiply top half by 19
movq $19, %rdx
xorq %rax, %rax
mulxq %r12, %rcx, %r12
adcxq %rcx, %r8
adoxq %r12, %r9
mulxq %r13, %rcx, %r13
adcxq %rcx, %r9
adoxq %r13, %r10
mulxq %r14, %rcx, %r14
adcxq %rcx, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rax, %rdx
adcxq %rax, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rax
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rsi)
movq %r9, 8(%rsi)
movq %r10, 16(%rsi)
movq %r11, 24(%rsi)
movq 24(%rsp), %rsi
movq 160(%rsp), %rbx
movq 144(%rsp), %rbp
# Multiply
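        # Same MULX/ADCX/ADOX multiply-and-reduce sequence as above; only
        # the source and destination pointers differ.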
# A[0] * B[0]
movq (%rbp), %rdx
mulxq (%rbx), %r8, %r9
# A[2] * B[0]
mulxq 16(%rbx), %r10, %r11
# A[1] * B[0]
mulxq 8(%rbx), %rcx, %rax
xorq %r15, %r15
adcxq %rcx, %r9
# A[1] * B[3]
movq 24(%rbp), %rdx
mulxq 8(%rbx), %r12, %r13
adcxq %rax, %r10
# A[0] * B[1]
movq 8(%rbp), %rdx
mulxq (%rbx), %rcx, %rax
adoxq %rcx, %r9
# A[2] * B[1]
mulxq 16(%rbx), %rcx, %r14
adoxq %rax, %r10
adcxq %rcx, %r11
# A[1] * B[2]
movq 16(%rbp), %rdx
mulxq 8(%rbx), %rcx, %rax
adcxq %r14, %r12
adoxq %rcx, %r11
adcxq %r15, %r13
adoxq %rax, %r12
# A[0] * B[2]
mulxq (%rbx), %rcx, %rax
adoxq %r15, %r13
xorq %r14, %r14
adcxq %rcx, %r10
# A[1] * B[1]
movq 8(%rbp), %rdx
mulxq 8(%rbx), %rdx, %rcx
adcxq %rax, %r11
adoxq %rdx, %r10
# A[3] * B[1]
movq 8(%rbp), %rdx
adoxq %rcx, %r11
mulxq 24(%rbx), %rcx, %rax
adcxq %rcx, %r12
# A[2] * B[2]
movq 16(%rbp), %rdx
mulxq 16(%rbx), %rdx, %rcx
adcxq %rax, %r13
adoxq %rdx, %r12
# A[3] * B[3]
movq 24(%rbp), %rdx
adoxq %rcx, %r13
mulxq 24(%rbx), %rcx, %rax
adoxq %r15, %r14
adcxq %rcx, %r14
# A[0] * B[3]
mulxq (%rbx), %rdx, %rcx
adcxq %rax, %r15
xorq %rax, %rax
adcxq %rdx, %r11
# A[3] * B[0]
movq (%rbp), %rdx
adcxq %rcx, %r12
mulxq 24(%rbx), %rdx, %rcx
adoxq %rdx, %r11
adoxq %rcx, %r12
# A[2] * B[3]
movq 24(%rbp), %rdx
mulxq 16(%rbx), %rdx, %rcx
adcxq %rdx, %r13
# A[3] * B[2]
movq 16(%rbp), %rdx
adcxq %rcx, %r14
mulxq 24(%rbx), %rcx, %rdx
adcxq %rax, %r15
adoxq %rcx, %r13
adoxq %rdx, %r14
adoxq %rax, %r15
# Reduce
movq $0x7fffffffffffffff, %rax
        # Move top half into r12-r15 and remove top bit (bit 255) from r11
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rax, %r11
# Multiply top half by 19
movq $19, %rdx
xorq %rax, %rax
mulxq %r12, %rcx, %r12
adcxq %rcx, %r8
adoxq %r12, %r9
mulxq %r13, %rcx, %r13
adcxq %rcx, %r9
adoxq %r13, %r10
mulxq %r14, %rcx, %r14
adcxq %rcx, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rax, %rdx
adcxq %rax, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rax
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rsi)
movq %r9, 8(%rsi)
movq %r10, 16(%rsi)
movq %r11, 24(%rsi)
movq 136(%rsp), %rsi
movq 152(%rsp), %rbx
# Multiply
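        # Same MULX/ADCX/ADOX multiply-and-reduce sequence as above; only
        # the source and destination pointers differ.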
# A[0] * B[0]
movq (%rbx), %rdx
mulxq (%rsi), %r8, %r9
# A[2] * B[0]
mulxq 16(%rsi), %r10, %r11
# A[1] * B[0]
mulxq 8(%rsi), %rcx, %rax
xorq %r15, %r15
adcxq %rcx, %r9
# A[1] * B[3]
movq 24(%rbx), %rdx
mulxq 8(%rsi), %r12, %r13
adcxq %rax, %r10
# A[0] * B[1]
movq 8(%rbx), %rdx
mulxq (%rsi), %rcx, %rax
adoxq %rcx, %r9
# A[2] * B[1]
mulxq 16(%rsi), %rcx, %r14
adoxq %rax, %r10
adcxq %rcx, %r11
# A[1] * B[2]
movq 16(%rbx), %rdx
mulxq 8(%rsi), %rcx, %rax
adcxq %r14, %r12
adoxq %rcx, %r11
adcxq %r15, %r13
adoxq %rax, %r12
# A[0] * B[2]
mulxq (%rsi), %rcx, %rax
adoxq %r15, %r13
xorq %r14, %r14
adcxq %rcx, %r10
# A[1] * B[1]
movq 8(%rbx), %rdx
mulxq 8(%rsi), %rdx, %rcx
adcxq %rax, %r11
adoxq %rdx, %r10
# A[3] * B[1]
movq 8(%rbx), %rdx
adoxq %rcx, %r11
mulxq 24(%rsi), %rcx, %rax
adcxq %rcx, %r12
# A[2] * B[2]
movq 16(%rbx), %rdx
mulxq 16(%rsi), %rdx, %rcx
adcxq %rax, %r13
adoxq %rdx, %r12
# A[3] * B[3]
movq 24(%rbx), %rdx
adoxq %rcx, %r13
mulxq 24(%rsi), %rcx, %rax
adoxq %r15, %r14
adcxq %rcx, %r14
# A[0] * B[3]
mulxq (%rsi), %rdx, %rcx
adcxq %rax, %r15
xorq %rax, %rax
adcxq %rdx, %r11
# A[3] * B[0]
movq (%rbx), %rdx
adcxq %rcx, %r12
mulxq 24(%rsi), %rdx, %rcx
adoxq %rdx, %r11
adoxq %rcx, %r12
# A[2] * B[3]
movq 24(%rbx), %rdx
mulxq 16(%rsi), %rdx, %rcx
adcxq %rdx, %r13
# A[3] * B[2]
movq 16(%rbx), %rdx
adcxq %rcx, %r14
mulxq 24(%rsi), %rcx, %rdx
adcxq %rax, %r15
adoxq %rcx, %r13
adoxq %rdx, %r14
adoxq %rax, %r15
# Reduce
movq $0x7fffffffffffffff, %rax
        # Move top half into r12-r15 and remove top bit (bit 255) from r11
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rax, %r11
# Multiply top half by 19
movq $19, %rdx
xorq %rax, %rax
mulxq %r12, %rcx, %r12
adcxq %rcx, %r8
adoxq %r12, %r9
mulxq %r13, %rcx, %r13
adcxq %rcx, %r9
adoxq %r13, %r10
mulxq %r14, %rcx, %r14
adcxq %rcx, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rax, %rdx
adcxq %rax, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rax
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
leaq 48(%rsp), %rsi
# Double
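        # 2*a: add the value to itself; sarq $63 turns bit 255 of the
        # doubled top limb into an all-ones/zero mask, and p = 2^255 - 19
        # is subtracted exactly when that bit was set.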
movq (%rdi), %r8
movq 8(%rdi), %r9
addq %r8, %r8
movq 16(%rdi), %r10
adcq %r9, %r9
movq 24(%rdi), %rdx
adcq %r10, %r10
movq $-19, %rcx
adcq %rdx, %rdx
movq $0x7fffffffffffffff, %rax
movq %rdx, %r11
sarq $63, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Sub modulus (if overflow)
subq %rcx, %r8
sbbq %rdx, %r9
sbbq %rdx, %r10
sbbq %rax, %r11
movq %r8, (%rsi)
movq %r9, 8(%rsi)
movq %r10, 16(%rsi)
movq %r11, 24(%rsi)
movq 8(%rsp), %rbx
movq 16(%rsp), %rbp
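        # Combined add/sub: r8-r11 accumulate (a + b) with a masked
        # conditional subtract of p on overflow into bit 255, while
        # r12-r15 keep a copy of a for (a - b) with a masked conditional
        # add of p on borrow.  The sum is stored to (%rbx) and the
        # difference to (%rdi).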
# Add
movq (%rbp), %r8
movq 8(%rbp), %r9
movq 16(%rbp), %r10
movq 24(%rbp), %rdx
movq %r8, %r12
addq (%rbx), %r8
movq %r9, %r13
adcq 8(%rbx), %r9
movq %r10, %r14
adcq 16(%rbx), %r10
movq %rdx, %r15
adcq 24(%rbx), %rdx
movq $-19, %rcx
movq %rdx, %r11
movq $0x7fffffffffffffff, %rax
sarq $63, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Sub modulus (if overflow)
subq %rcx, %r8
sbbq %rdx, %r9
sbbq %rdx, %r10
sbbq %rax, %r11
# Sub
subq (%rbx), %r12
movq $0x00, %rdx
sbbq 8(%rbx), %r13
movq $-19, %rcx
sbbq 16(%rbx), %r14
movq $0x7fffffffffffffff, %rax
sbbq 24(%rbx), %r15
sbbq $0x00, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Add modulus (if underflow)
addq %rcx, %r12
adcq %rdx, %r13
adcq %rdx, %r14
adcq %rax, %r15
movq %r8, (%rbx)
movq %r9, 8(%rbx)
movq %r10, 16(%rbx)
movq %r11, 24(%rbx)
movq %r12, (%rdi)
movq %r13, 8(%rdi)
movq %r14, 16(%rdi)
movq %r15, 24(%rdi)
movq 24(%rsp), %rdi
# Add
movq (%rsi), %r8
movq 8(%rsi), %r9
movq 16(%rsi), %r10
movq 24(%rsi), %rdx
movq %r8, %r12
addq (%rdi), %r8
movq %r9, %r13
adcq 8(%rdi), %r9
movq %r10, %r14
adcq 16(%rdi), %r10
movq %rdx, %r15
adcq 24(%rdi), %rdx
movq $-19, %rcx
movq %rdx, %r11
movq $0x7fffffffffffffff, %rax
sarq $63, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Sub modulus (if overflow)
subq %rcx, %r8
sbbq %rdx, %r9
sbbq %rdx, %r10
sbbq %rax, %r11
# Sub
subq (%rdi), %r12
movq $0x00, %rdx
sbbq 8(%rdi), %r13
movq $-19, %rcx
sbbq 16(%rdi), %r14
movq $0x7fffffffffffffff, %rax
sbbq 24(%rdi), %r15
sbbq $0x00, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Add modulus (if underflow)
addq %rcx, %r12
adcq %rdx, %r13
adcq %rdx, %r14
adcq %rax, %r15
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq %r12, (%rbp)
movq %r13, 8(%rbp)
movq %r14, 16(%rbp)
movq %r15, 24(%rbp)
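        # Epilogue: release the 0x50-byte local frame and restore the
        # callee-saved registers pushed on entry.  repz retq is the
        # two-byte "rep ret" form, commonly used to avoid an AMD
        # branch-predictor penalty for a bare ret at a branch target.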
addq $0x50, %rsp
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbp
popq %rbx
repz retq
#ifndef __APPLE__
.size fe_ge_sub_avx2,.-fe_ge_sub_avx2
#endif /* __APPLE__ */
#endif /* HAVE_INTEL_AVX2 */