mirror of
https://github.com/FreeRTOS/FreeRTOS-Kernel.git
synced 2025-12-13 23:25:10 -05:00
* deleted old version of wolfSSL before updating
* updated wolfSSL to the latest version (v4.4.0)
* added macros for timing resistance

Co-authored-by: RichardBarry <3073890+RichardBarry@users.noreply.github.com>
Co-authored-by: Ming Yue <mingyue86010@gmail.com>
16542 lines
403 KiB
ArmAsm
/* fe_x25519_asm
 *
 * Copyright (C) 2006-2020 wolfSSL Inc.
 *
 * This file is part of wolfSSL.
 *
 * wolfSSL is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * wolfSSL is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
 */

#ifndef HAVE_INTEL_AVX1
#define HAVE_INTEL_AVX1
#endif /* HAVE_INTEL_AVX1 */
#ifndef NO_AVX2_SUPPORT
#define HAVE_INTEL_AVX2
#endif /* NO_AVX2_SUPPORT */

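# Note: fe_init picks an implementation at runtime. It reads the CPUID
# feature flags once (cached in cpuFlagsSet/intelFlags) and, when the two
# bits tested by the 0x50 mask are both set (AVX1 and AVX2 available), it
# repoints the fe_*_p and curve25519_p function pointers defined further
# below from the default *_x64 routines to the *_avx2 variants.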
#ifndef __APPLE__
.text
.globl fe_init
.type fe_init,@function
.align 4
fe_init:
#else
.section __TEXT,__text
.globl _fe_init
.p2align 2
_fe_init:
#endif /* __APPLE__ */
#ifdef HAVE_INTEL_AVX2
#ifndef __APPLE__
    movq cpuFlagsSet@GOTPCREL(%rip), %rax
    movl (%rax), %eax
#else
    movl _cpuFlagsSet(%rip), %eax
#endif /* __APPLE__ */
    testl %eax, %eax
    je L_fe_init_get_flags
    repz retq
L_fe_init_get_flags:
#ifndef __APPLE__
    callq cpuid_get_flags@plt
#else
    callq _cpuid_get_flags
#endif /* __APPLE__ */
#ifndef __APPLE__
    movq intelFlags@GOTPCREL(%rip), %rdx
    movl %eax, (%rdx)
#else
    movl %eax, _intelFlags(%rip)
#endif /* __APPLE__ */
    andl $0x50, %eax
    cmpl $0x50, %eax
    jne L_fe_init_flags_done
#ifndef __APPLE__
    movq fe_mul_avx2@GOTPCREL(%rip), %rax
#else
    leaq _fe_mul_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
    movq fe_mul_p@GOTPCREL(%rip), %rdx
    movq %rax, (%rdx)
#else
    movq %rax, _fe_mul_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
    movq fe_sq_avx2@GOTPCREL(%rip), %rax
#else
    leaq _fe_sq_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
    movq fe_sq_p@GOTPCREL(%rip), %rdx
    movq %rax, (%rdx)
#else
    movq %rax, _fe_sq_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
    movq fe_mul121666_avx2@GOTPCREL(%rip), %rax
#else
    leaq _fe_mul121666_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
    movq fe_mul121666_p@GOTPCREL(%rip), %rdx
    movq %rax, (%rdx)
#else
    movq %rax, _fe_mul121666_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
    movq fe_sq2_avx2@GOTPCREL(%rip), %rax
#else
    leaq _fe_sq2_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
    movq fe_sq2_p@GOTPCREL(%rip), %rdx
    movq %rax, (%rdx)
#else
    movq %rax, _fe_sq2_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
    movq fe_invert_avx2@GOTPCREL(%rip), %rax
#else
    leaq _fe_invert_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
    movq fe_invert_p@GOTPCREL(%rip), %rdx
    movq %rax, (%rdx)
#else
    movq %rax, _fe_invert_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
    movq curve25519_avx2@GOTPCREL(%rip), %rax
#else
    leaq _curve25519_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
    movq curve25519_p@GOTPCREL(%rip), %rdx
    movq %rax, (%rdx)
#else
    movq %rax, _curve25519_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
    movq fe_pow22523_avx2@GOTPCREL(%rip), %rax
#else
    leaq _fe_pow22523_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
    movq fe_pow22523_p@GOTPCREL(%rip), %rdx
    movq %rax, (%rdx)
#else
    movq %rax, _fe_pow22523_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
    movq fe_ge_to_p2_avx2@GOTPCREL(%rip), %rax
#else
    leaq _fe_ge_to_p2_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
    movq fe_ge_to_p2_p@GOTPCREL(%rip), %rdx
    movq %rax, (%rdx)
#else
    movq %rax, _fe_ge_to_p2_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
    movq fe_ge_to_p3_avx2@GOTPCREL(%rip), %rax
#else
    leaq _fe_ge_to_p3_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
    movq fe_ge_to_p3_p@GOTPCREL(%rip), %rdx
    movq %rax, (%rdx)
#else
    movq %rax, _fe_ge_to_p3_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
    movq fe_ge_dbl_avx2@GOTPCREL(%rip), %rax
#else
    leaq _fe_ge_dbl_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
    movq fe_ge_dbl_p@GOTPCREL(%rip), %rdx
    movq %rax, (%rdx)
#else
    movq %rax, _fe_ge_dbl_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
    movq fe_ge_madd_avx2@GOTPCREL(%rip), %rax
#else
    leaq _fe_ge_madd_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
    movq fe_ge_madd_p@GOTPCREL(%rip), %rdx
    movq %rax, (%rdx)
#else
    movq %rax, _fe_ge_madd_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
    movq fe_ge_msub_avx2@GOTPCREL(%rip), %rax
#else
    leaq _fe_ge_msub_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
    movq fe_ge_msub_p@GOTPCREL(%rip), %rdx
    movq %rax, (%rdx)
#else
    movq %rax, _fe_ge_msub_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
    movq fe_ge_add_avx2@GOTPCREL(%rip), %rax
#else
    leaq _fe_ge_add_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
    movq fe_ge_add_p@GOTPCREL(%rip), %rdx
    movq %rax, (%rdx)
#else
    movq %rax, _fe_ge_add_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
    movq fe_ge_sub_avx2@GOTPCREL(%rip), %rax
#else
    leaq _fe_ge_sub_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
    movq fe_ge_sub_p@GOTPCREL(%rip), %rdx
    movq %rax, (%rdx)
#else
    movq %rax, _fe_ge_sub_p(%rip)
#endif /* __APPLE__ */
L_fe_init_flags_done:
#ifndef __APPLE__
    movq cpuFlagsSet@GOTPCREL(%rip), %rdx
    movl $0x1, (%rdx)
#else
    movl $0x1, _cpuFlagsSet(%rip)
#endif /* __APPLE__ */
#endif /* HAVE_INTEL_AVX2 */
    repz retq
#ifndef __APPLE__
.size fe_init,.-fe_init
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_frombytes
.type fe_frombytes,@function
.align 4
fe_frombytes:
#else
.section __TEXT,__text
.globl _fe_frombytes
.p2align 2
_fe_frombytes:
#endif /* __APPLE__ */
    movq $0x7fffffffffffffff, %r9
    movq (%rsi), %rdx
    movq 8(%rsi), %rax
    movq 16(%rsi), %rcx
    movq 24(%rsi), %r8
    andq %r9, %r8
    movq %rdx, (%rdi)
    movq %rax, 8(%rdi)
    movq %rcx, 16(%rdi)
    movq %r8, 24(%rdi)
    repz retq
#ifndef __APPLE__
.size fe_frombytes,.-fe_frombytes
#endif /* __APPLE__ */
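# Note: fe_tobytes produces the canonical encoding. Adding 19 and shifting
# out bit 255 yields a carry exactly when the value is >= 2^255-19, so
# adding 19*carry to the original limbs and masking off bit 255 fully
# reduces mod p without a secret-dependent branch.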
#ifndef __APPLE__
.text
.globl fe_tobytes
.type fe_tobytes,@function
.align 4
fe_tobytes:
#else
.section __TEXT,__text
.globl _fe_tobytes
.p2align 2
_fe_tobytes:
#endif /* __APPLE__ */
    movq $0x7fffffffffffffff, %r10
    movq (%rsi), %rdx
    movq 8(%rsi), %rax
    movq 16(%rsi), %rcx
    movq 24(%rsi), %r8
    addq $19, %rdx
    adcq $0x00, %rax
    adcq $0x00, %rcx
    adcq $0x00, %r8
    shrq $63, %r8
    imulq $19, %r8, %r9
    movq (%rsi), %rdx
    movq 8(%rsi), %rax
    movq 16(%rsi), %rcx
    movq 24(%rsi), %r8
    addq %r9, %rdx
    adcq $0x00, %rax
    adcq $0x00, %rcx
    adcq $0x00, %r8
    andq %r10, %r8
    movq %rdx, (%rdi)
    movq %rax, 8(%rdi)
    movq %rcx, 16(%rdi)
    movq %r8, 24(%rdi)
    repz retq
#ifndef __APPLE__
.size fe_tobytes,.-fe_tobytes
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_1
.type fe_1,@function
.align 4
fe_1:
#else
.section __TEXT,__text
.globl _fe_1
.p2align 2
_fe_1:
#endif /* __APPLE__ */
    # Set one
    movq $0x01, (%rdi)
    movq $0x00, 8(%rdi)
    movq $0x00, 16(%rdi)
    movq $0x00, 24(%rdi)
    repz retq
#ifndef __APPLE__
.size fe_1,.-fe_1
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_0
.type fe_0,@function
.align 4
fe_0:
#else
.section __TEXT,__text
.globl _fe_0
.p2align 2
_fe_0:
#endif /* __APPLE__ */
    # Set zero
    movq $0x00, (%rdi)
    movq $0x00, 8(%rdi)
    movq $0x00, 16(%rdi)
    movq $0x00, 24(%rdi)
    repz retq
#ifndef __APPLE__
.size fe_0,.-fe_0
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_copy
.type fe_copy,@function
.align 4
fe_copy:
#else
.section __TEXT,__text
.globl _fe_copy
.p2align 2
_fe_copy:
#endif /* __APPLE__ */
    # Copy
    movq (%rsi), %rdx
    movq 8(%rsi), %rax
    movq 16(%rsi), %rcx
    movq 24(%rsi), %r8
    movq %rdx, (%rdi)
    movq %rax, 8(%rdi)
    movq %rcx, 16(%rdi)
    movq %r8, 24(%rdi)
    repz retq
#ifndef __APPLE__
.size fe_copy,.-fe_copy
#endif /* __APPLE__ */
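# Note: fe_sub and fe_add below keep results in range without branching.
# The final borrow (or sign) bit is spread into an all-ones mask, which
# selects either zero or the modulus limbs (-19, -1, -1,
# 0x7fffffffffffffff) to add back in after a subtraction or subtract off
# after an addition.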
#ifndef __APPLE__
.text
.globl fe_sub
.type fe_sub,@function
.align 4
fe_sub:
#else
.section __TEXT,__text
.globl _fe_sub
.p2align 2
_fe_sub:
#endif /* __APPLE__ */
    pushq %r12
    # Sub
    movq (%rsi), %rax
    movq 8(%rsi), %rcx
    movq 16(%rsi), %r8
    movq 24(%rsi), %r9
    subq (%rdx), %rax
    movq $0x00, %r10
    sbbq 8(%rdx), %rcx
    movq $-19, %r11
    sbbq 16(%rdx), %r8
    movq $0x7fffffffffffffff, %r12
    sbbq 24(%rdx), %r9
    sbbq $0x00, %r10
    # Mask the modulus
    andq %r10, %r11
    andq %r10, %r12
    # Add modulus (if underflow)
    addq %r11, %rax
    adcq %r10, %rcx
    adcq %r10, %r8
    adcq %r12, %r9
    movq %rax, (%rdi)
    movq %rcx, 8(%rdi)
    movq %r8, 16(%rdi)
    movq %r9, 24(%rdi)
    popq %r12
    repz retq
#ifndef __APPLE__
.size fe_sub,.-fe_sub
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_add
.type fe_add,@function
.align 4
fe_add:
#else
.section __TEXT,__text
.globl _fe_add
.p2align 2
_fe_add:
#endif /* __APPLE__ */
    pushq %r12
    # Add
    movq (%rsi), %rax
    movq 8(%rsi), %rcx
    addq (%rdx), %rax
    movq 16(%rsi), %r8
    adcq 8(%rdx), %rcx
    movq 24(%rsi), %r10
    adcq 16(%rdx), %r8
    movq $-19, %r11
    adcq 24(%rdx), %r10
    movq $0x7fffffffffffffff, %r12
    movq %r10, %r9
    sarq $63, %r10
    # Mask the modulus
    andq %r10, %r11
    andq %r10, %r12
    # Sub modulus (if overflow)
    subq %r11, %rax
    sbbq %r10, %rcx
    sbbq %r10, %r8
    sbbq %r12, %r9
    movq %rax, (%rdi)
    movq %rcx, 8(%rdi)
    movq %r8, 16(%rdi)
    movq %r9, 24(%rdi)
    popq %r12
    repz retq
#ifndef __APPLE__
.size fe_add,.-fe_add
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_neg
.type fe_neg,@function
.align 4
fe_neg:
#else
.section __TEXT,__text
.globl _fe_neg
.p2align 2
_fe_neg:
#endif /* __APPLE__ */
    movq $-19, %rdx
    movq $-1, %rax
    movq $-1, %rcx
    movq $0x7fffffffffffffff, %r8
    subq (%rsi), %rdx
    sbbq 8(%rsi), %rax
    sbbq 16(%rsi), %rcx
    sbbq 24(%rsi), %r8
    movq %rdx, (%rdi)
    movq %rax, 8(%rdi)
    movq %rcx, 16(%rdi)
    movq %r8, 24(%rdi)
    repz retq
#ifndef __APPLE__
.size fe_neg,.-fe_neg
#endif /* __APPLE__ */
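# Note: fe_cmov(a, b, c) is a constant-time conditional copy. Both operands
# are always loaded and the cmove instructions select b only when c == 1,
# so the timing and memory access pattern do not depend on the secret flag.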
#ifndef __APPLE__
.text
.globl fe_cmov
.type fe_cmov,@function
.align 4
fe_cmov:
#else
.section __TEXT,__text
.globl _fe_cmov
.p2align 2
_fe_cmov:
#endif /* __APPLE__ */
    cmpl $0x01, %edx
    movq (%rdi), %rcx
    movq 8(%rdi), %r8
    movq 16(%rdi), %r9
    movq 24(%rdi), %r10
    cmoveq (%rsi), %rcx
    cmoveq 8(%rsi), %r8
    cmoveq 16(%rsi), %r9
    cmoveq 24(%rsi), %r10
    movq %rcx, (%rdi)
    movq %r8, 8(%rdi)
    movq %r9, 16(%rdi)
    movq %r10, 24(%rdi)
    repz retq
#ifndef __APPLE__
.size fe_cmov,.-fe_cmov
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_isnonzero
.type fe_isnonzero,@function
.align 4
fe_isnonzero:
#else
.section __TEXT,__text
.globl _fe_isnonzero
.p2align 2
_fe_isnonzero:
#endif /* __APPLE__ */
    movq $0x7fffffffffffffff, %r10
    movq (%rdi), %rax
    movq 8(%rdi), %rdx
    movq 16(%rdi), %rcx
    movq 24(%rdi), %r8
    addq $19, %rax
    adcq $0x00, %rdx
    adcq $0x00, %rcx
    adcq $0x00, %r8
    shrq $63, %r8
    imulq $19, %r8, %r9
    movq (%rdi), %rax
    movq 8(%rdi), %rdx
    movq 16(%rdi), %rcx
    movq 24(%rdi), %r8
    addq %r9, %rax
    adcq $0x00, %rdx
    adcq $0x00, %rcx
    adcq $0x00, %r8
    andq %r10, %r8
    orq %rdx, %rax
    orq %rcx, %rax
    orq %r8, %rax
    repz retq
#ifndef __APPLE__
.size fe_isnonzero,.-fe_isnonzero
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_isnegative
.type fe_isnegative,@function
.align 4
fe_isnegative:
#else
.section __TEXT,__text
.globl _fe_isnegative
.p2align 2
_fe_isnegative:
#endif /* __APPLE__ */
    movq $0x7fffffffffffffff, %r11
    movq (%rdi), %rdx
    movq 8(%rdi), %rcx
    movq 16(%rdi), %r8
    movq 24(%rdi), %r9
    movq %rdx, %rax
    addq $19, %rdx
    adcq $0x00, %rcx
    adcq $0x00, %r8
    adcq $0x00, %r9
    shrq $63, %r9
    imulq $19, %r9, %r10
    addq %r10, %rax
    andq $0x01, %rax
    repz retq
#ifndef __APPLE__
.size fe_isnegative,.-fe_isnegative
#endif /* __APPLE__ */
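# Note: fe_cmov_table reads all eight precomputed table entries and keeps
# only the one matching |b| via cmove, then uses the sign of b to
# conditionally swap and negate the result. This constant-time scan is
# intended to keep the secret index from leaking through cache or branch
# behaviour.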
#ifndef __APPLE__
.text
.globl fe_cmov_table
.type fe_cmov_table,@function
.align 4
fe_cmov_table:
#else
.section __TEXT,__text
.globl _fe_cmov_table
.p2align 2
_fe_cmov_table:
#endif /* __APPLE__ */
    pushq %r12
    pushq %r13
    pushq %r14
    pushq %r15
    movq %rdx, %rcx
    movsbq %cl, %rax
    cdq
    xorb %dl, %al
    subb %dl, %al
    movb %al, %r15b
    movq $0x01, %rax
    xorq %rdx, %rdx
    xorq %r8, %r8
    xorq %r9, %r9
    movq $0x01, %r10
    xorq %r11, %r11
    xorq %r12, %r12
    xorq %r13, %r13
    cmpb $0x01, %r15b
    movq (%rsi), %r14
    cmoveq %r14, %rax
    movq 8(%rsi), %r14
    cmoveq %r14, %rdx
    movq 16(%rsi), %r14
    cmoveq %r14, %r8
    movq 24(%rsi), %r14
    cmoveq %r14, %r9
    movq 32(%rsi), %r14
    cmoveq %r14, %r10
    movq 40(%rsi), %r14
    cmoveq %r14, %r11
    movq 48(%rsi), %r14
    cmoveq %r14, %r12
    movq 56(%rsi), %r14
    cmoveq %r14, %r13
    cmpb $2, %r15b
    movq 96(%rsi), %r14
    cmoveq %r14, %rax
    movq 104(%rsi), %r14
    cmoveq %r14, %rdx
    movq 112(%rsi), %r14
    cmoveq %r14, %r8
    movq 120(%rsi), %r14
    cmoveq %r14, %r9
    movq 128(%rsi), %r14
    cmoveq %r14, %r10
    movq 136(%rsi), %r14
    cmoveq %r14, %r11
    movq 144(%rsi), %r14
    cmoveq %r14, %r12
    movq 152(%rsi), %r14
    cmoveq %r14, %r13
    cmpb $3, %r15b
    movq 192(%rsi), %r14
    cmoveq %r14, %rax
    movq 200(%rsi), %r14
    cmoveq %r14, %rdx
    movq 208(%rsi), %r14
    cmoveq %r14, %r8
    movq 216(%rsi), %r14
    cmoveq %r14, %r9
    movq 224(%rsi), %r14
    cmoveq %r14, %r10
    movq 232(%rsi), %r14
    cmoveq %r14, %r11
    movq 240(%rsi), %r14
    cmoveq %r14, %r12
    movq 248(%rsi), %r14
    cmoveq %r14, %r13
    cmpb $4, %r15b
    movq 288(%rsi), %r14
    cmoveq %r14, %rax
    movq 296(%rsi), %r14
    cmoveq %r14, %rdx
    movq 304(%rsi), %r14
    cmoveq %r14, %r8
    movq 312(%rsi), %r14
    cmoveq %r14, %r9
    movq 320(%rsi), %r14
    cmoveq %r14, %r10
    movq 328(%rsi), %r14
    cmoveq %r14, %r11
    movq 336(%rsi), %r14
    cmoveq %r14, %r12
    movq 344(%rsi), %r14
    cmoveq %r14, %r13
    cmpb $5, %r15b
    movq 384(%rsi), %r14
    cmoveq %r14, %rax
    movq 392(%rsi), %r14
    cmoveq %r14, %rdx
    movq 400(%rsi), %r14
    cmoveq %r14, %r8
    movq 408(%rsi), %r14
    cmoveq %r14, %r9
    movq 416(%rsi), %r14
    cmoveq %r14, %r10
    movq 424(%rsi), %r14
    cmoveq %r14, %r11
    movq 432(%rsi), %r14
    cmoveq %r14, %r12
    movq 440(%rsi), %r14
    cmoveq %r14, %r13
    cmpb $6, %r15b
    movq 480(%rsi), %r14
    cmoveq %r14, %rax
    movq 488(%rsi), %r14
    cmoveq %r14, %rdx
    movq 496(%rsi), %r14
    cmoveq %r14, %r8
    movq 504(%rsi), %r14
    cmoveq %r14, %r9
    movq 512(%rsi), %r14
    cmoveq %r14, %r10
    movq 520(%rsi), %r14
    cmoveq %r14, %r11
    movq 528(%rsi), %r14
    cmoveq %r14, %r12
    movq 536(%rsi), %r14
    cmoveq %r14, %r13
    cmpb $7, %r15b
    movq 576(%rsi), %r14
    cmoveq %r14, %rax
    movq 584(%rsi), %r14
    cmoveq %r14, %rdx
    movq 592(%rsi), %r14
    cmoveq %r14, %r8
    movq 600(%rsi), %r14
    cmoveq %r14, %r9
    movq 608(%rsi), %r14
    cmoveq %r14, %r10
    movq 616(%rsi), %r14
    cmoveq %r14, %r11
    movq 624(%rsi), %r14
    cmoveq %r14, %r12
    movq 632(%rsi), %r14
    cmoveq %r14, %r13
    cmpb $8, %r15b
    movq 672(%rsi), %r14
    cmoveq %r14, %rax
    movq 680(%rsi), %r14
    cmoveq %r14, %rdx
    movq 688(%rsi), %r14
    cmoveq %r14, %r8
    movq 696(%rsi), %r14
    cmoveq %r14, %r9
    movq 704(%rsi), %r14
    cmoveq %r14, %r10
    movq 712(%rsi), %r14
    cmoveq %r14, %r11
    movq 720(%rsi), %r14
    cmoveq %r14, %r12
    movq 728(%rsi), %r14
    cmoveq %r14, %r13
    cmpb $0x00, %cl
    movq %rax, %r14
    cmovlq %r10, %rax
    cmovlq %r14, %r10
    movq %rdx, %r14
    cmovlq %r11, %rdx
    cmovlq %r14, %r11
    movq %r8, %r14
    cmovlq %r12, %r8
    cmovlq %r14, %r12
    movq %r9, %r14
    cmovlq %r13, %r9
    cmovlq %r14, %r13
    movq %rax, (%rdi)
    movq %rdx, 8(%rdi)
    movq %r8, 16(%rdi)
    movq %r9, 24(%rdi)
    movq %r10, 32(%rdi)
    movq %r11, 40(%rdi)
    movq %r12, 48(%rdi)
    movq %r13, 56(%rdi)
    xorq %rax, %rax
    xorq %rdx, %rdx
    xorq %r8, %r8
    xorq %r9, %r9
    cmpb $0x01, %r15b
    movq 64(%rsi), %r14
    cmoveq %r14, %rax
    movq 72(%rsi), %r14
    cmoveq %r14, %rdx
    movq 80(%rsi), %r14
    cmoveq %r14, %r8
    movq 88(%rsi), %r14
    cmoveq %r14, %r9
    cmpb $2, %r15b
    movq 160(%rsi), %r14
    cmoveq %r14, %rax
    movq 168(%rsi), %r14
    cmoveq %r14, %rdx
    movq 176(%rsi), %r14
    cmoveq %r14, %r8
    movq 184(%rsi), %r14
    cmoveq %r14, %r9
    cmpb $3, %r15b
    movq 256(%rsi), %r14
    cmoveq %r14, %rax
    movq 264(%rsi), %r14
    cmoveq %r14, %rdx
    movq 272(%rsi), %r14
    cmoveq %r14, %r8
    movq 280(%rsi), %r14
    cmoveq %r14, %r9
    cmpb $4, %r15b
    movq 352(%rsi), %r14
    cmoveq %r14, %rax
    movq 360(%rsi), %r14
    cmoveq %r14, %rdx
    movq 368(%rsi), %r14
    cmoveq %r14, %r8
    movq 376(%rsi), %r14
    cmoveq %r14, %r9
    cmpb $5, %r15b
    movq 448(%rsi), %r14
    cmoveq %r14, %rax
    movq 456(%rsi), %r14
    cmoveq %r14, %rdx
    movq 464(%rsi), %r14
    cmoveq %r14, %r8
    movq 472(%rsi), %r14
    cmoveq %r14, %r9
    cmpb $6, %r15b
    movq 544(%rsi), %r14
    cmoveq %r14, %rax
    movq 552(%rsi), %r14
    cmoveq %r14, %rdx
    movq 560(%rsi), %r14
    cmoveq %r14, %r8
    movq 568(%rsi), %r14
    cmoveq %r14, %r9
    cmpb $7, %r15b
    movq 640(%rsi), %r14
    cmoveq %r14, %rax
    movq 648(%rsi), %r14
    cmoveq %r14, %rdx
    movq 656(%rsi), %r14
    cmoveq %r14, %r8
    movq 664(%rsi), %r14
    cmoveq %r14, %r9
    cmpb $8, %r15b
    movq 736(%rsi), %r14
    cmoveq %r14, %rax
    movq 744(%rsi), %r14
    cmoveq %r14, %rdx
    movq 752(%rsi), %r14
    cmoveq %r14, %r8
    movq 760(%rsi), %r14
    cmoveq %r14, %r9
    movq $-19, %r10
    movq $-1, %r11
    movq $-1, %r12
    movq $0x7fffffffffffffff, %r13
    subq %rax, %r10
    sbbq %rdx, %r11
    sbbq %r8, %r12
    sbbq %r9, %r13
    cmpb $0x00, %cl
    cmovlq %r10, %rax
    cmovlq %r11, %rdx
    cmovlq %r12, %r8
    cmovlq %r13, %r9
    movq %rax, 64(%rdi)
    movq %rdx, 72(%rdi)
    movq %r8, 80(%rdi)
    movq %r9, 88(%rdi)
    popq %r15
    popq %r14
    popq %r13
    popq %r12
    repz retq
#ifndef __APPLE__
.size fe_cmov_table,.-fe_cmov_table
#endif /* __APPLE__ */
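# Note: the generic fe_* entry points below are trampolines. Each one
# tail-jumps through its fe_*_p function pointer, which fe_init has set to
# either the plain x64 or the AVX2 implementation.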
#ifndef __APPLE__
.text
.globl fe_mul
.type fe_mul,@function
.align 4
fe_mul:
#else
.section __TEXT,__text
.globl _fe_mul
.p2align 2
_fe_mul:
#endif /* __APPLE__ */
#ifndef __APPLE__
    jmpq *fe_mul_p(%rip)
#else
    jmpq *_fe_mul_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
.size fe_mul,.-fe_mul
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_sq
.type fe_sq,@function
.align 4
fe_sq:
#else
.section __TEXT,__text
.globl _fe_sq
.p2align 2
_fe_sq:
#endif /* __APPLE__ */
#ifndef __APPLE__
    jmpq *fe_sq_p(%rip)
#else
    jmpq *_fe_sq_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
.size fe_sq,.-fe_sq
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_mul121666
.type fe_mul121666,@function
.align 4
fe_mul121666:
#else
.section __TEXT,__text
.globl _fe_mul121666
.p2align 2
_fe_mul121666:
#endif /* __APPLE__ */
#ifndef __APPLE__
    jmpq *fe_mul121666_p(%rip)
#else
    jmpq *_fe_mul121666_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
.size fe_mul121666,.-fe_mul121666
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_sq2
.type fe_sq2,@function
.align 4
fe_sq2:
#else
.section __TEXT,__text
.globl _fe_sq2
.p2align 2
_fe_sq2:
#endif /* __APPLE__ */
#ifndef __APPLE__
    jmpq *fe_sq2_p(%rip)
#else
    jmpq *_fe_sq2_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
.size fe_sq2,.-fe_sq2
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_invert
.type fe_invert,@function
.align 4
fe_invert:
#else
.section __TEXT,__text
.globl _fe_invert
.p2align 2
_fe_invert:
#endif /* __APPLE__ */
#ifndef __APPLE__
    jmpq *fe_invert_p(%rip)
#else
    jmpq *_fe_invert_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
.size fe_invert,.-fe_invert
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl curve25519
.type curve25519,@function
.align 4
curve25519:
#else
.section __TEXT,__text
.globl _curve25519
.p2align 2
_curve25519:
#endif /* __APPLE__ */
#ifndef __APPLE__
    jmpq *curve25519_p(%rip)
#else
    jmpq *_curve25519_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
.size curve25519,.-curve25519
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_pow22523
.type fe_pow22523,@function
.align 4
fe_pow22523:
#else
.section __TEXT,__text
.globl _fe_pow22523
.p2align 2
_fe_pow22523:
#endif /* __APPLE__ */
#ifndef __APPLE__
    jmpq *fe_pow22523_p(%rip)
#else
    jmpq *_fe_pow22523_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
.size fe_pow22523,.-fe_pow22523
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_ge_to_p2
.type fe_ge_to_p2,@function
.align 4
fe_ge_to_p2:
#else
.section __TEXT,__text
.globl _fe_ge_to_p2
.p2align 2
_fe_ge_to_p2:
#endif /* __APPLE__ */
#ifndef __APPLE__
    jmpq *fe_ge_to_p2_p(%rip)
#else
    jmpq *_fe_ge_to_p2_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
.size fe_ge_to_p2,.-fe_ge_to_p2
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_ge_to_p3
.type fe_ge_to_p3,@function
.align 4
fe_ge_to_p3:
#else
.section __TEXT,__text
.globl _fe_ge_to_p3
.p2align 2
_fe_ge_to_p3:
#endif /* __APPLE__ */
#ifndef __APPLE__
    jmpq *fe_ge_to_p3_p(%rip)
#else
    jmpq *_fe_ge_to_p3_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
.size fe_ge_to_p3,.-fe_ge_to_p3
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_ge_dbl
.type fe_ge_dbl,@function
.align 4
fe_ge_dbl:
#else
.section __TEXT,__text
.globl _fe_ge_dbl
.p2align 2
_fe_ge_dbl:
#endif /* __APPLE__ */
#ifndef __APPLE__
    jmpq *fe_ge_dbl_p(%rip)
#else
    jmpq *_fe_ge_dbl_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
.size fe_ge_dbl,.-fe_ge_dbl
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_ge_madd
.type fe_ge_madd,@function
.align 4
fe_ge_madd:
#else
.section __TEXT,__text
.globl _fe_ge_madd
.p2align 2
_fe_ge_madd:
#endif /* __APPLE__ */
#ifndef __APPLE__
    jmpq *fe_ge_madd_p(%rip)
#else
    jmpq *_fe_ge_madd_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
.size fe_ge_madd,.-fe_ge_madd
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_ge_msub
.type fe_ge_msub,@function
.align 4
fe_ge_msub:
#else
.section __TEXT,__text
.globl _fe_ge_msub
.p2align 2
_fe_ge_msub:
#endif /* __APPLE__ */
#ifndef __APPLE__
    jmpq *fe_ge_msub_p(%rip)
#else
    jmpq *_fe_ge_msub_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
.size fe_ge_msub,.-fe_ge_msub
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_ge_add
.type fe_ge_add,@function
.align 4
fe_ge_add:
#else
.section __TEXT,__text
.globl _fe_ge_add
.p2align 2
_fe_ge_add:
#endif /* __APPLE__ */
#ifndef __APPLE__
    jmpq *fe_ge_add_p(%rip)
#else
    jmpq *_fe_ge_add_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
.size fe_ge_add,.-fe_ge_add
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_ge_sub
.type fe_ge_sub,@function
.align 4
fe_ge_sub:
#else
.section __TEXT,__text
.globl _fe_ge_sub
.p2align 2
_fe_ge_sub:
#endif /* __APPLE__ */
#ifndef __APPLE__
    jmpq *fe_ge_sub_p(%rip)
#else
    jmpq *_fe_ge_sub_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
.size fe_ge_sub,.-fe_ge_sub
#endif /* __APPLE__ */
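# Note: implementation-pointer table. Every pointer starts at the plain
# x64 routine; fe_init overwrites it with the *_avx2 variant when AVX2 is
# detected.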
#ifndef __APPLE__
.data
.type cpuFlagsSet, @object
.size cpuFlagsSet,4
cpuFlagsSet:
    .long 0
#else
.section __DATA,__data
.p2align 2
_cpuFlagsSet:
    .long 0
#endif /* __APPLE__ */
#ifndef __APPLE__
.data
.type intelFlags, @object
.size intelFlags,4
intelFlags:
    .long 0
#else
.section __DATA,__data
.p2align 2
_intelFlags:
    .long 0
#endif /* __APPLE__ */
#ifndef __APPLE__
.data
.type fe_mul_p, @object
.size fe_mul_p,8
fe_mul_p:
    .quad fe_mul_x64
#else
.section __DATA,__data
.p2align 2
_fe_mul_p:
    .quad _fe_mul_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.data
.type fe_sq_p, @object
.size fe_sq_p,8
fe_sq_p:
    .quad fe_sq_x64
#else
.section __DATA,__data
.p2align 2
_fe_sq_p:
    .quad _fe_sq_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.data
.type fe_mul121666_p, @object
.size fe_mul121666_p,8
fe_mul121666_p:
    .quad fe_mul121666_x64
#else
.section __DATA,__data
.p2align 2
_fe_mul121666_p:
    .quad _fe_mul121666_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.data
.type fe_sq2_p, @object
.size fe_sq2_p,8
fe_sq2_p:
    .quad fe_sq2_x64
#else
.section __DATA,__data
.p2align 2
_fe_sq2_p:
    .quad _fe_sq2_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.data
.type fe_invert_p, @object
.size fe_invert_p,8
fe_invert_p:
    .quad fe_invert_x64
#else
.section __DATA,__data
.p2align 2
_fe_invert_p:
    .quad _fe_invert_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.data
.type curve25519_p, @object
.size curve25519_p,8
curve25519_p:
    .quad curve25519_x64
#else
.section __DATA,__data
.p2align 2
_curve25519_p:
    .quad _curve25519_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.data
.type fe_pow22523_p, @object
.size fe_pow22523_p,8
fe_pow22523_p:
    .quad fe_pow22523_x64
#else
.section __DATA,__data
.p2align 2
_fe_pow22523_p:
    .quad _fe_pow22523_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.data
.type fe_ge_to_p2_p, @object
.size fe_ge_to_p2_p,8
fe_ge_to_p2_p:
    .quad fe_ge_to_p2_x64
#else
.section __DATA,__data
.p2align 2
_fe_ge_to_p2_p:
    .quad _fe_ge_to_p2_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.data
.type fe_ge_to_p3_p, @object
.size fe_ge_to_p3_p,8
fe_ge_to_p3_p:
    .quad fe_ge_to_p3_x64
#else
.section __DATA,__data
.p2align 2
_fe_ge_to_p3_p:
    .quad _fe_ge_to_p3_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.data
.type fe_ge_dbl_p, @object
.size fe_ge_dbl_p,8
fe_ge_dbl_p:
    .quad fe_ge_dbl_x64
#else
.section __DATA,__data
.p2align 2
_fe_ge_dbl_p:
    .quad _fe_ge_dbl_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.data
.type fe_ge_madd_p, @object
.size fe_ge_madd_p,8
fe_ge_madd_p:
    .quad fe_ge_madd_x64
#else
.section __DATA,__data
.p2align 2
_fe_ge_madd_p:
    .quad _fe_ge_madd_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.data
.type fe_ge_msub_p, @object
.size fe_ge_msub_p,8
fe_ge_msub_p:
    .quad fe_ge_msub_x64
#else
.section __DATA,__data
.p2align 2
_fe_ge_msub_p:
    .quad _fe_ge_msub_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.data
.type fe_ge_add_p, @object
.size fe_ge_add_p,8
fe_ge_add_p:
    .quad fe_ge_add_x64
#else
.section __DATA,__data
.p2align 2
_fe_ge_add_p:
    .quad _fe_ge_add_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.data
.type fe_ge_sub_p, @object
.size fe_ge_sub_p,8
fe_ge_sub_p:
    .quad fe_ge_sub_x64
#else
.section __DATA,__data
.p2align 2
_fe_ge_sub_p:
    .quad _fe_ge_sub_x64
#endif /* __APPLE__ */
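# Note: fe_mul_x64 is a schoolbook 4x4 64-bit limb multiply (16 mulq) into
# r8..r15, followed by reduction using 2^255 = 19 (mod p): the top 256 bits
# are shifted up one bit, multiplied by 19 and folded back into the low
# half, with two final carry passes for anything that spills into bit 255.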
#ifndef __APPLE__
.text
.globl fe_mul_x64
.type fe_mul_x64,@function
.align 4
fe_mul_x64:
#else
.section __TEXT,__text
.globl _fe_mul_x64
.p2align 2
_fe_mul_x64:
#endif /* __APPLE__ */
    pushq %r12
    pushq %r13
    pushq %r14
    pushq %r15
    pushq %rbx
    movq %rdx, %rcx
    # Multiply
    # A[0] * B[0]
    movq (%rcx), %rax
    mulq (%rsi)
    movq %rax, %r8
    movq %rdx, %r9
    # A[0] * B[1]
    movq 8(%rcx), %rax
    mulq (%rsi)
    xorq %r10, %r10
    addq %rax, %r9
    adcq %rdx, %r10
    # A[1] * B[0]
    movq (%rcx), %rax
    mulq 8(%rsi)
    xorq %r11, %r11
    addq %rax, %r9
    adcq %rdx, %r10
    adcq $0x00, %r11
    # A[0] * B[2]
    movq 16(%rcx), %rax
    mulq (%rsi)
    addq %rax, %r10
    adcq %rdx, %r11
    # A[1] * B[1]
    movq 8(%rcx), %rax
    mulq 8(%rsi)
    xorq %r12, %r12
    addq %rax, %r10
    adcq %rdx, %r11
    adcq $0x00, %r12
    # A[2] * B[0]
    movq (%rcx), %rax
    mulq 16(%rsi)
    addq %rax, %r10
    adcq %rdx, %r11
    adcq $0x00, %r12
    # A[0] * B[3]
    movq 24(%rcx), %rax
    mulq (%rsi)
    xorq %r13, %r13
    addq %rax, %r11
    adcq %rdx, %r12
    adcq $0x00, %r13
    # A[1] * B[2]
    movq 16(%rcx), %rax
    mulq 8(%rsi)
    addq %rax, %r11
    adcq %rdx, %r12
    adcq $0x00, %r13
    # A[2] * B[1]
    movq 8(%rcx), %rax
    mulq 16(%rsi)
    addq %rax, %r11
    adcq %rdx, %r12
    adcq $0x00, %r13
    # A[3] * B[0]
    movq (%rcx), %rax
    mulq 24(%rsi)
    addq %rax, %r11
    adcq %rdx, %r12
    adcq $0x00, %r13
    # A[1] * B[3]
    movq 24(%rcx), %rax
    mulq 8(%rsi)
    xorq %r14, %r14
    addq %rax, %r12
    adcq %rdx, %r13
    adcq $0x00, %r14
    # A[2] * B[2]
    movq 16(%rcx), %rax
    mulq 16(%rsi)
    addq %rax, %r12
    adcq %rdx, %r13
    adcq $0x00, %r14
    # A[3] * B[1]
    movq 8(%rcx), %rax
    mulq 24(%rsi)
    addq %rax, %r12
    adcq %rdx, %r13
    adcq $0x00, %r14
    # A[2] * B[3]
    movq 24(%rcx), %rax
    mulq 16(%rsi)
    xorq %r15, %r15
    addq %rax, %r13
    adcq %rdx, %r14
    adcq $0x00, %r15
    # A[3] * B[2]
    movq 16(%rcx), %rax
    mulq 24(%rsi)
    addq %rax, %r13
    adcq %rdx, %r14
    adcq $0x00, %r15
    # A[3] * B[3]
    movq 24(%rcx), %rax
    mulq 24(%rsi)
    addq %rax, %r14
    adcq %rdx, %r15
    # Reduce
    movq $0x7fffffffffffffff, %rbx
    # Move top half into t4-t7 and remove top bit from t3
    shldq $0x01, %r14, %r15
    shldq $0x01, %r13, %r14
    shldq $0x01, %r12, %r13
    shldq $0x01, %r11, %r12
    andq %rbx, %r11
    # Multiply top half by 19
    movq $19, %rax
    mulq %r12
    xorq %r12, %r12
    addq %rax, %r8
    movq $19, %rax
    adcq %rdx, %r12
    mulq %r13
    xorq %r13, %r13
    addq %rax, %r9
    movq $19, %rax
    adcq %rdx, %r13
    mulq %r14
    xorq %r14, %r14
    addq %rax, %r10
    movq $19, %rax
    adcq %rdx, %r14
    mulq %r15
    # Add remaining product results in
    addq %r12, %r9
    adcq %r13, %r10
    adcq %r14, %r11
    adcq %rax, %r11
    adcq $0x00, %rdx
    # Overflow
    shldq $0x01, %r11, %rdx
    imulq $19, %rdx, %rax
    andq %rbx, %r11
    addq %rax, %r8
    adcq $0x00, %r9
    adcq $0x00, %r10
    adcq $0x00, %r11
    # Reduce if top bit set
    movq %r11, %rdx
    shrq $63, %rdx
    imulq $19, %rdx, %rax
    andq %rbx, %r11
    addq %rax, %r8
    adcq $0x00, %r9
    adcq $0x00, %r10
    adcq $0x00, %r11
    # Store
    movq %r8, (%rdi)
    movq %r9, 8(%rdi)
    movq %r10, 16(%rdi)
    movq %r11, 24(%rdi)
    popq %rbx
    popq %r15
    popq %r14
    popq %r13
    popq %r12
    repz retq
#ifndef __APPLE__
.size fe_mul_x64,.-fe_mul_x64
#endif /* __APPLE__ */
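# Note: fe_sq_x64 squares with the usual shortcut. The off-diagonal
# products are computed once and doubled with an add/adc chain, the
# diagonal squares are added in, and the same 19-fold reduction as
# fe_mul_x64 follows.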
#ifndef __APPLE__
.text
.globl fe_sq_x64
.type fe_sq_x64,@function
.align 4
fe_sq_x64:
#else
.section __TEXT,__text
.globl _fe_sq_x64
.p2align 2
_fe_sq_x64:
#endif /* __APPLE__ */
    pushq %r12
    pushq %r13
    pushq %r14
    pushq %r15
    # Square
    # A[0] * A[1]
    movq (%rsi), %rax
    mulq 8(%rsi)
    movq %rax, %r8
    movq %rdx, %r9
    # A[0] * A[2]
    movq (%rsi), %rax
    mulq 16(%rsi)
    xorq %r10, %r10
    addq %rax, %r9
    adcq %rdx, %r10
    # A[0] * A[3]
    movq (%rsi), %rax
    mulq 24(%rsi)
    xorq %r11, %r11
    addq %rax, %r10
    adcq %rdx, %r11
    # A[1] * A[2]
    movq 8(%rsi), %rax
    mulq 16(%rsi)
    xorq %r12, %r12
    addq %rax, %r10
    adcq %rdx, %r11
    adcq $0x00, %r12
    # A[1] * A[3]
    movq 8(%rsi), %rax
    mulq 24(%rsi)
    addq %rax, %r11
    adcq %rdx, %r12
    # A[2] * A[3]
    movq 16(%rsi), %rax
    mulq 24(%rsi)
    xorq %r13, %r13
    addq %rax, %r12
    adcq %rdx, %r13
    # Double
    xorq %r14, %r14
    addq %r8, %r8
    adcq %r9, %r9
    adcq %r10, %r10
    adcq %r11, %r11
    adcq %r12, %r12
    adcq %r13, %r13
    adcq $0x00, %r14
    # A[0] * A[0]
    movq (%rsi), %rax
    mulq %rax
    movq %rax, %rcx
    movq %rdx, %r15
    # A[1] * A[1]
    movq 8(%rsi), %rax
    mulq %rax
    addq %r15, %r8
    adcq %rax, %r9
    adcq $0x00, %rdx
    movq %rdx, %r15
    # A[2] * A[2]
    movq 16(%rsi), %rax
    mulq %rax
    addq %r15, %r10
    adcq %rax, %r11
    adcq $0x00, %rdx
    movq %rdx, %r15
    # A[3] * A[3]
    movq 24(%rsi), %rax
    mulq %rax
    addq %rax, %r13
    adcq %rdx, %r14
    addq %r15, %r12
    adcq $0x00, %r13
    adcq $0x00, %r14
    # Reduce
    movq $0x7fffffffffffffff, %r15
    # Move top half into t4-t7 and remove top bit from t3
    shldq $0x01, %r13, %r14
    shldq $0x01, %r12, %r13
    shldq $0x01, %r11, %r12
    shldq $0x01, %r10, %r11
    andq %r15, %r10
    # Multiply top half by 19
    movq $19, %rax
    mulq %r11
    xorq %r11, %r11
    addq %rax, %rcx
    movq $19, %rax
    adcq %rdx, %r11
    mulq %r12
    xorq %r12, %r12
    addq %rax, %r8
    movq $19, %rax
    adcq %rdx, %r12
    mulq %r13
    xorq %r13, %r13
    addq %rax, %r9
    movq $19, %rax
    adcq %rdx, %r13
    mulq %r14
    # Add remaining product results in
    addq %r11, %r8
    adcq %r12, %r9
    adcq %r13, %r10
    adcq %rax, %r10
    adcq $0x00, %rdx
    # Overflow
    shldq $0x01, %r10, %rdx
    imulq $19, %rdx, %rax
    andq %r15, %r10
    addq %rax, %rcx
    adcq $0x00, %r8
    adcq $0x00, %r9
    adcq $0x00, %r10
    # Reduce if top bit set
    movq %r10, %rdx
    shrq $63, %rdx
    imulq $19, %rdx, %rax
    andq %r15, %r10
    addq %rax, %rcx
    adcq $0x00, %r8
    adcq $0x00, %r9
    adcq $0x00, %r10
    # Store
    movq %rcx, (%rdi)
    movq %r8, 8(%rdi)
    movq %r9, 16(%rdi)
    movq %r10, 24(%rdi)
    popq %r15
    popq %r14
    popq %r13
    popq %r12
    repz retq
#ifndef __APPLE__
.size fe_sq_x64,.-fe_sq_x64
#endif /* __APPLE__ */
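# Note: fe_sq_n_x64(r, a, n) is fe_sq_x64 in a loop driven by the count
# in %cl; the inversion ladder below uses it for its long runs of
# squarings.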
#ifndef __APPLE__
.text
.globl fe_sq_n_x64
.type fe_sq_n_x64,@function
.align 4
fe_sq_n_x64:
#else
.section __TEXT,__text
.globl _fe_sq_n_x64
.p2align 2
_fe_sq_n_x64:
#endif /* __APPLE__ */
    pushq %r12
    pushq %r13
    pushq %r14
    pushq %r15
    pushq %rbx
    movq %rdx, %rcx
L_fe_sq_n_x64:
    # Square
    # A[0] * A[1]
    movq (%rsi), %rax
    mulq 8(%rsi)
    movq %rax, %r9
    movq %rdx, %r10
    # A[0] * A[2]
    movq (%rsi), %rax
    mulq 16(%rsi)
    xorq %r11, %r11
    addq %rax, %r10
    adcq %rdx, %r11
    # A[0] * A[3]
    movq (%rsi), %rax
    mulq 24(%rsi)
    xorq %r12, %r12
    addq %rax, %r11
    adcq %rdx, %r12
    # A[1] * A[2]
    movq 8(%rsi), %rax
    mulq 16(%rsi)
    xorq %r13, %r13
    addq %rax, %r11
    adcq %rdx, %r12
    adcq $0x00, %r13
    # A[1] * A[3]
    movq 8(%rsi), %rax
    mulq 24(%rsi)
    addq %rax, %r12
    adcq %rdx, %r13
    # A[2] * A[3]
    movq 16(%rsi), %rax
    mulq 24(%rsi)
    xorq %r14, %r14
    addq %rax, %r13
    adcq %rdx, %r14
    # Double
    xorq %r15, %r15
    addq %r9, %r9
    adcq %r10, %r10
    adcq %r11, %r11
    adcq %r12, %r12
    adcq %r13, %r13
    adcq %r14, %r14
    adcq $0x00, %r15
    # A[0] * A[0]
    movq (%rsi), %rax
    mulq %rax
    movq %rax, %r8
    movq %rdx, %rbx
    # A[1] * A[1]
    movq 8(%rsi), %rax
    mulq %rax
    addq %rbx, %r9
    adcq %rax, %r10
    adcq $0x00, %rdx
    movq %rdx, %rbx
    # A[2] * A[2]
    movq 16(%rsi), %rax
    mulq %rax
    addq %rbx, %r11
    adcq %rax, %r12
    adcq $0x00, %rdx
    movq %rdx, %rbx
    # A[3] * A[3]
    movq 24(%rsi), %rax
    mulq %rax
    addq %rax, %r14
    adcq %rdx, %r15
    addq %rbx, %r13
    adcq $0x00, %r14
    adcq $0x00, %r15
    # Reduce
    movq $0x7fffffffffffffff, %rbx
    # Move top half into t4-t7 and remove top bit from t3
    shldq $0x01, %r14, %r15
    shldq $0x01, %r13, %r14
    shldq $0x01, %r12, %r13
    shldq $0x01, %r11, %r12
    andq %rbx, %r11
    # Multiply top half by 19
    movq $19, %rax
    mulq %r12
    xorq %r12, %r12
    addq %rax, %r8
    movq $19, %rax
    adcq %rdx, %r12
    mulq %r13
    xorq %r13, %r13
    addq %rax, %r9
    movq $19, %rax
    adcq %rdx, %r13
    mulq %r14
    xorq %r14, %r14
    addq %rax, %r10
    movq $19, %rax
    adcq %rdx, %r14
    mulq %r15
    # Add remaining product results in
    addq %r12, %r9
    adcq %r13, %r10
    adcq %r14, %r11
    adcq %rax, %r11
    adcq $0x00, %rdx
    # Overflow
    shldq $0x01, %r11, %rdx
    imulq $19, %rdx, %rax
    andq %rbx, %r11
    addq %rax, %r8
    adcq $0x00, %r9
    adcq $0x00, %r10
    adcq $0x00, %r11
    # Reduce if top bit set
    movq %r11, %rdx
    shrq $63, %rdx
    imulq $19, %rdx, %rax
    andq %rbx, %r11
    addq %rax, %r8
    adcq $0x00, %r9
    adcq $0x00, %r10
    adcq $0x00, %r11
    # Store
    movq %r8, (%rdi)
    movq %r9, 8(%rdi)
    movq %r10, 16(%rdi)
    movq %r11, 24(%rdi)
    decb %cl
    jnz L_fe_sq_n_x64
    popq %rbx
    popq %r15
    popq %r14
    popq %r13
    popq %r12
    repz retq
#ifndef __APPLE__
.size fe_sq_n_x64,.-fe_sq_n_x64
#endif /* __APPLE__ */
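# Note: fe_mul121666_x64 multiplies by 121666 = 0x1db42, the curve
# constant (486662+2)/4 used in the Montgomery ladder's z-coordinate step.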
#ifndef __APPLE__
.text
.globl fe_mul121666_x64
.type fe_mul121666_x64,@function
.align 4
fe_mul121666_x64:
#else
.section __TEXT,__text
.globl _fe_mul121666_x64
.p2align 2
_fe_mul121666_x64:
#endif /* __APPLE__ */
    pushq %r12
    # Multiply by 121666
    movq $0x1db42, %rax
    mulq (%rsi)
    xorq %r10, %r10
    movq %rax, %r8
    movq %rdx, %r9
    movq $0x1db42, %rax
    mulq 8(%rsi)
    xorq %r11, %r11
    addq %rax, %r9
    adcq %rdx, %r10
    movq $0x1db42, %rax
    mulq 16(%rsi)
    xorq %r12, %r12
    addq %rax, %r10
    adcq %rdx, %r11
    movq $0x1db42, %rax
    mulq 24(%rsi)
    movq $0x7fffffffffffffff, %rcx
    addq %rax, %r11
    adcq %rdx, %r12
    shldq $0x01, %r11, %r12
    andq %rcx, %r11
    movq $19, %rax
    mulq %r12
    addq %rax, %r8
    adcq $0x00, %r9
    adcq $0x00, %r10
    adcq $0x00, %r11
    movq %r8, (%rdi)
    movq %r9, 8(%rdi)
    movq %r10, 16(%rdi)
    movq %r11, 24(%rdi)
    popq %r12
    repz retq
#ifndef __APPLE__
.size fe_mul121666_x64,.-fe_mul121666_x64
#endif /* __APPLE__ */
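# Note: fe_sq2_x64 computes 2*a^2. The extra doubling appears to be folded
# into the reduction shifts (shld by 2 instead of 1), and the bits pushed
# past the top are scaled by 19*19 = 0x169 before being added back.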
#ifndef __APPLE__
.text
.globl fe_sq2_x64
.type fe_sq2_x64,@function
.align 4
fe_sq2_x64:
#else
.section __TEXT,__text
.globl _fe_sq2_x64
.p2align 2
_fe_sq2_x64:
#endif /* __APPLE__ */
    pushq %r12
    pushq %r13
    pushq %r14
    pushq %r15
    pushq %rbx
    # Square * 2
    # A[0] * A[1]
    movq (%rsi), %rax
    mulq 8(%rsi)
    movq %rax, %r8
    movq %rdx, %r9
    # A[0] * A[2]
    movq (%rsi), %rax
    mulq 16(%rsi)
    xorq %r10, %r10
    addq %rax, %r9
    adcq %rdx, %r10
    # A[0] * A[3]
    movq (%rsi), %rax
    mulq 24(%rsi)
    xorq %r11, %r11
    addq %rax, %r10
    adcq %rdx, %r11
    # A[1] * A[2]
    movq 8(%rsi), %rax
    mulq 16(%rsi)
    xorq %r12, %r12
    addq %rax, %r10
    adcq %rdx, %r11
    adcq $0x00, %r12
    # A[1] * A[3]
    movq 8(%rsi), %rax
    mulq 24(%rsi)
    addq %rax, %r11
    adcq %rdx, %r12
    # A[2] * A[3]
    movq 16(%rsi), %rax
    mulq 24(%rsi)
    xorq %r13, %r13
    addq %rax, %r12
    adcq %rdx, %r13
    # Double
    xorq %r14, %r14
    addq %r8, %r8
    adcq %r9, %r9
    adcq %r10, %r10
    adcq %r11, %r11
    adcq %r12, %r12
    adcq %r13, %r13
    adcq $0x00, %r14
    # A[0] * A[0]
    movq (%rsi), %rax
    mulq %rax
    movq %rax, %rcx
    movq %rdx, %r15
    # A[1] * A[1]
    movq 8(%rsi), %rax
    mulq %rax
    addq %r15, %r8
    adcq %rax, %r9
    adcq $0x00, %rdx
    movq %rdx, %r15
    # A[2] * A[2]
    movq 16(%rsi), %rax
    mulq %rax
    addq %r15, %r10
    adcq %rax, %r11
    adcq $0x00, %rdx
    movq %rdx, %r15
    # A[3] * A[3]
    movq 24(%rsi), %rax
    mulq %rax
    addq %rax, %r13
    adcq %rdx, %r14
    addq %r15, %r12
    adcq $0x00, %r13
    adcq $0x00, %r14
    # Reduce
    movq $0x7fffffffffffffff, %rbx
    xorq %rax, %rax
    # Move top half into t4-t7 and remove top bit from t3
    shldq $3, %r14, %rax
    shldq $2, %r13, %r14
    shldq $2, %r12, %r13
    shldq $2, %r11, %r12
    shldq $2, %r10, %r11
    shldq $0x01, %r9, %r10
    shldq $0x01, %r8, %r9
    shldq $0x01, %rcx, %r8
    shlq $0x01, %rcx
    andq %rbx, %r10
    # Two out left, one in right
    andq %rbx, %r14
    # Multiply top bits by 19*19
    imulq $0x169, %rax, %r15
    # Multiply top half by 19
    movq $19, %rax
    mulq %r11
    xorq %r11, %r11
    addq %rax, %rcx
    movq $19, %rax
    adcq %rdx, %r11
    mulq %r12
    xorq %r12, %r12
    addq %rax, %r8
    movq $19, %rax
    adcq %rdx, %r12
    mulq %r13
    xorq %r13, %r13
    addq %rax, %r9
    movq $19, %rax
    adcq %rdx, %r13
    mulq %r14
    # Add remaining product results in
    addq %r15, %rcx
    adcq %r11, %r8
    adcq %r12, %r9
    adcq %r13, %r10
    adcq %rax, %r10
    adcq $0x00, %rdx
    # Overflow
    shldq $0x01, %r10, %rdx
    imulq $19, %rdx, %rax
    andq %rbx, %r10
    addq %rax, %rcx
    adcq $0x00, %r8
    adcq $0x00, %r9
    adcq $0x00, %r10
    # Reduce if top bit set
    movq %r10, %rdx
    shrq $63, %rdx
    imulq $19, %rdx, %rax
    andq %rbx, %r10
    addq %rax, %rcx
    adcq $0x00, %r8
    adcq $0x00, %r9
    adcq $0x00, %r10
    # Store
    movq %rcx, (%rdi)
    movq %r8, 8(%rdi)
    movq %r9, 16(%rdi)
    movq %r10, 24(%rdi)
    popq %rbx
    popq %r15
    popq %r14
    popq %r13
    popq %r12
    repz retq
#ifndef __APPLE__
.size fe_sq2_x64,.-fe_sq2_x64
#endif /* __APPLE__ */
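# Note: fe_invert_x64 computes a^(p-2) = 1/a (Fermat's little theorem)
# using the standard Curve25519 addition chain of squarings
# (fe_sq_x64/fe_sq_n_x64) and multiplies, staged through four temporaries
# on the stack.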
#ifndef __APPLE__
.text
.globl fe_invert_x64
.type fe_invert_x64,@function
.align 4
fe_invert_x64:
#else
.section __TEXT,__text
.globl _fe_invert_x64
.p2align 2
_fe_invert_x64:
#endif /* __APPLE__ */
    subq $0x90, %rsp
    # Invert
    movq %rdi, 128(%rsp)
    movq %rsi, 136(%rsp)
    movq %rsp, %rdi
    movq 136(%rsp), %rsi
#ifndef __APPLE__
    callq fe_sq_x64@plt
#else
    callq _fe_sq_x64
#endif /* __APPLE__ */
    leaq 32(%rsp), %rdi
    movq %rsp, %rsi
#ifndef __APPLE__
    callq fe_sq_x64@plt
#else
    callq _fe_sq_x64
#endif /* __APPLE__ */
    leaq 32(%rsp), %rdi
    leaq 32(%rsp), %rsi
#ifndef __APPLE__
    callq fe_sq_x64@plt
#else
    callq _fe_sq_x64
#endif /* __APPLE__ */
    leaq 32(%rsp), %rdi
    movq 136(%rsp), %rsi
    leaq 32(%rsp), %rdx
#ifndef __APPLE__
    callq fe_mul_x64@plt
#else
    callq _fe_mul_x64
#endif /* __APPLE__ */
    movq %rsp, %rdi
    movq %rsp, %rsi
    leaq 32(%rsp), %rdx
#ifndef __APPLE__
    callq fe_mul_x64@plt
#else
    callq _fe_mul_x64
#endif /* __APPLE__ */
    leaq 64(%rsp), %rdi
    movq %rsp, %rsi
#ifndef __APPLE__
    callq fe_sq_x64@plt
#else
    callq _fe_sq_x64
#endif /* __APPLE__ */
    leaq 32(%rsp), %rdi
    leaq 32(%rsp), %rsi
    leaq 64(%rsp), %rdx
#ifndef __APPLE__
    callq fe_mul_x64@plt
#else
    callq _fe_mul_x64
#endif /* __APPLE__ */
    leaq 64(%rsp), %rdi
    leaq 32(%rsp), %rsi
#ifndef __APPLE__
    callq fe_sq_x64@plt
#else
    callq _fe_sq_x64
#endif /* __APPLE__ */
    leaq 64(%rsp), %rdi
    leaq 64(%rsp), %rsi
    movq $4, %rdx
#ifndef __APPLE__
    callq fe_sq_n_x64@plt
#else
    callq _fe_sq_n_x64
#endif /* __APPLE__ */
    leaq 32(%rsp), %rdi
    leaq 64(%rsp), %rsi
    leaq 32(%rsp), %rdx
#ifndef __APPLE__
    callq fe_mul_x64@plt
#else
    callq _fe_mul_x64
#endif /* __APPLE__ */
    leaq 64(%rsp), %rdi
    leaq 32(%rsp), %rsi
#ifndef __APPLE__
    callq fe_sq_x64@plt
#else
    callq _fe_sq_x64
#endif /* __APPLE__ */
    leaq 64(%rsp), %rdi
    leaq 64(%rsp), %rsi
    movq $9, %rdx
#ifndef __APPLE__
    callq fe_sq_n_x64@plt
#else
    callq _fe_sq_n_x64
#endif /* __APPLE__ */
    leaq 64(%rsp), %rdi
    leaq 64(%rsp), %rsi
    leaq 32(%rsp), %rdx
#ifndef __APPLE__
    callq fe_mul_x64@plt
#else
    callq _fe_mul_x64
#endif /* __APPLE__ */
    leaq 96(%rsp), %rdi
    leaq 64(%rsp), %rsi
#ifndef __APPLE__
    callq fe_sq_x64@plt
#else
    callq _fe_sq_x64
#endif /* __APPLE__ */
    leaq 96(%rsp), %rdi
    leaq 96(%rsp), %rsi
    movq $19, %rdx
#ifndef __APPLE__
    callq fe_sq_n_x64@plt
#else
    callq _fe_sq_n_x64
#endif /* __APPLE__ */
    leaq 64(%rsp), %rdi
    leaq 96(%rsp), %rsi
    leaq 64(%rsp), %rdx
#ifndef __APPLE__
    callq fe_mul_x64@plt
#else
    callq _fe_mul_x64
#endif /* __APPLE__ */
    leaq 64(%rsp), %rdi
    leaq 64(%rsp), %rsi
#ifndef __APPLE__
    callq fe_sq_x64@plt
#else
    callq _fe_sq_x64
#endif /* __APPLE__ */
    leaq 64(%rsp), %rdi
    leaq 64(%rsp), %rsi
    movq $9, %rdx
#ifndef __APPLE__
    callq fe_sq_n_x64@plt
#else
    callq _fe_sq_n_x64
#endif /* __APPLE__ */
    leaq 32(%rsp), %rdi
    leaq 64(%rsp), %rsi
    leaq 32(%rsp), %rdx
#ifndef __APPLE__
    callq fe_mul_x64@plt
#else
    callq _fe_mul_x64
#endif /* __APPLE__ */
    leaq 64(%rsp), %rdi
    leaq 32(%rsp), %rsi
#ifndef __APPLE__
    callq fe_sq_x64@plt
#else
    callq _fe_sq_x64
#endif /* __APPLE__ */
    leaq 64(%rsp), %rdi
    leaq 64(%rsp), %rsi
    movq $49, %rdx
#ifndef __APPLE__
    callq fe_sq_n_x64@plt
#else
    callq _fe_sq_n_x64
#endif /* __APPLE__ */
    leaq 64(%rsp), %rdi
    leaq 64(%rsp), %rsi
    leaq 32(%rsp), %rdx
#ifndef __APPLE__
    callq fe_mul_x64@plt
#else
    callq _fe_mul_x64
#endif /* __APPLE__ */
    leaq 96(%rsp), %rdi
    leaq 64(%rsp), %rsi
#ifndef __APPLE__
    callq fe_sq_x64@plt
#else
    callq _fe_sq_x64
#endif /* __APPLE__ */
    leaq 96(%rsp), %rdi
    leaq 96(%rsp), %rsi
    movq $0x63, %rdx
#ifndef __APPLE__
    callq fe_sq_n_x64@plt
#else
    callq _fe_sq_n_x64
#endif /* __APPLE__ */
    leaq 64(%rsp), %rdi
    leaq 96(%rsp), %rsi
    leaq 64(%rsp), %rdx
#ifndef __APPLE__
    callq fe_mul_x64@plt
#else
    callq _fe_mul_x64
#endif /* __APPLE__ */
    leaq 64(%rsp), %rdi
    leaq 64(%rsp), %rsi
#ifndef __APPLE__
    callq fe_sq_x64@plt
#else
    callq _fe_sq_x64
#endif /* __APPLE__ */
    leaq 64(%rsp), %rdi
    leaq 64(%rsp), %rsi
    movq $49, %rdx
#ifndef __APPLE__
    callq fe_sq_n_x64@plt
#else
    callq _fe_sq_n_x64
#endif /* __APPLE__ */
    leaq 32(%rsp), %rdi
    leaq 64(%rsp), %rsi
    leaq 32(%rsp), %rdx
#ifndef __APPLE__
    callq fe_mul_x64@plt
#else
    callq _fe_mul_x64
#endif /* __APPLE__ */
    leaq 32(%rsp), %rdi
    leaq 32(%rsp), %rsi
#ifndef __APPLE__
    callq fe_sq_x64@plt
#else
    callq _fe_sq_x64
#endif /* __APPLE__ */
    leaq 32(%rsp), %rdi
    leaq 32(%rsp), %rsi
    movq $4, %rdx
#ifndef __APPLE__
    callq fe_sq_n_x64@plt
#else
    callq _fe_sq_n_x64
#endif /* __APPLE__ */
    movq 128(%rsp), %rdi
    leaq 32(%rsp), %rsi
    movq %rsp, %rdx
#ifndef __APPLE__
    callq fe_mul_x64@plt
#else
    callq _fe_mul_x64
#endif /* __APPLE__ */
    movq 136(%rsp), %rsi
    movq 128(%rsp), %rdi
    addq $0x90, %rsp
    repz retq
#ifndef __APPLE__
.size fe_invert_x64,.-fe_invert_x64
#endif /* __APPLE__ */
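# Note: curve25519_x64(r, n, a) runs the Montgomery ladder over the scalar
# bits from bit 254 down. Each step extracts one bit, conditionally swaps
# the two working points with an XOR/AND mask (no secret-dependent
# branches), then performs the ladder-step additions, subtractions,
# multiplies and squarings inline.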
|
|
#ifndef __APPLE__
|
|
.text
|
|
.globl curve25519_x64
|
|
.type curve25519_x64,@function
|
|
.align 4
|
|
curve25519_x64:
|
|
#else
|
|
.section __TEXT,__text
|
|
.globl _curve25519_x64
|
|
.p2align 2
|
|
_curve25519_x64:
|
|
#endif /* __APPLE__ */
|
|
pushq %r12
|
|
pushq %r13
|
|
pushq %r14
|
|
pushq %r15
|
|
pushq %rbx
|
|
pushq %rbp
|
|
movq %rdx, %r8
|
|
subq $0xb8, %rsp
|
|
xorq %rbx, %rbx
|
|
movq %rdi, 176(%rsp)
|
|
# Set one
|
|
movq $0x01, (%rdi)
|
|
movq $0x00, 8(%rdi)
|
|
movq $0x00, 16(%rdi)
|
|
movq $0x00, 24(%rdi)
|
|
# Set zero
|
|
movq $0x00, (%rsp)
|
|
movq $0x00, 8(%rsp)
|
|
movq $0x00, 16(%rsp)
|
|
movq $0x00, 24(%rsp)
|
|
# Set one
|
|
movq $0x01, 32(%rsp)
|
|
movq $0x00, 40(%rsp)
|
|
movq $0x00, 48(%rsp)
|
|
movq $0x00, 56(%rsp)
|
|
# Copy
|
|
movq (%r8), %rcx
|
|
movq 8(%r8), %r9
|
|
movq 16(%r8), %r10
|
|
movq 24(%r8), %r11
|
|
movq %rcx, 64(%rsp)
|
|
movq %r9, 72(%rsp)
|
|
movq %r10, 80(%rsp)
|
|
movq %r11, 88(%rsp)
|
|
movb $62, 168(%rsp)
|
|
movq $3, 160(%rsp)
|
|
L_curve25519_x64_words:
|
|
L_curve25519_x64_bits:
|
|
movq 160(%rsp), %r9
|
|
movb 168(%rsp), %cl
|
|
movq (%rsi,%r9,8), %rbp
|
|
shrq %cl, %rbp
|
|
andq $0x01, %rbp
|
|
xorq %rbp, %rbx
|
|
negq %rbx
|
|
# Conditional Swap
|
|
movq (%rdi), %rcx
|
|
movq 8(%rdi), %r9
|
|
movq 16(%rdi), %r10
|
|
movq 24(%rdi), %r11
|
|
xorq 64(%rsp), %rcx
|
|
xorq 72(%rsp), %r9
|
|
xorq 80(%rsp), %r10
|
|
xorq 88(%rsp), %r11
|
|
andq %rbx, %rcx
|
|
andq %rbx, %r9
|
|
andq %rbx, %r10
|
|
andq %rbx, %r11
|
|
xorq %rcx, (%rdi)
|
|
xorq %r9, 8(%rdi)
|
|
xorq %r10, 16(%rdi)
|
|
xorq %r11, 24(%rdi)
|
|
xorq %rcx, 64(%rsp)
|
|
xorq %r9, 72(%rsp)
|
|
xorq %r10, 80(%rsp)
|
|
xorq %r11, 88(%rsp)
|
|
# Conditional Swap
|
|
movq (%rsp), %rcx
|
|
movq 8(%rsp), %r9
|
|
movq 16(%rsp), %r10
|
|
movq 24(%rsp), %r11
|
|
xorq 32(%rsp), %rcx
|
|
xorq 40(%rsp), %r9
|
|
xorq 48(%rsp), %r10
|
|
xorq 56(%rsp), %r11
|
|
andq %rbx, %rcx
|
|
andq %rbx, %r9
|
|
andq %rbx, %r10
|
|
andq %rbx, %r11
|
|
xorq %rcx, (%rsp)
|
|
xorq %r9, 8(%rsp)
|
|
xorq %r10, 16(%rsp)
|
|
xorq %r11, 24(%rsp)
|
|
xorq %rcx, 32(%rsp)
|
|
xorq %r9, 40(%rsp)
|
|
xorq %r10, 48(%rsp)
|
|
xorq %r11, 56(%rsp)
|
|
movq %rbp, %rbx
|
|
# Add
|
|
movq (%rdi), %rcx
|
|
movq 8(%rdi), %r9
|
|
movq 16(%rdi), %r10
|
|
movq 24(%rdi), %rbp
|
|
movq %rcx, %r12
|
|
addq (%rsp), %rcx
|
|
movq %r9, %r13
|
|
adcq 8(%rsp), %r9
|
|
movq %r10, %r14
|
|
adcq 16(%rsp), %r10
|
|
movq %rbp, %r15
|
|
adcq 24(%rsp), %rbp
|
|
movq $-19, %rax
|
|
movq %rbp, %r11
|
|
movq $0x7fffffffffffffff, %rdx
|
|
sarq $63, %rbp
|
|
# Mask the modulus
|
|
andq %rbp, %rax
|
|
andq %rbp, %rdx
|
|
# Sub modulus (if overflow)
|
|
subq %rax, %rcx
|
|
sbbq %rbp, %r9
|
|
sbbq %rbp, %r10
|
|
sbbq %rdx, %r11
|
|
# Sub
|
|
subq (%rsp), %r12
|
|
movq $0x00, %rbp
|
|
sbbq 8(%rsp), %r13
|
|
movq $-19, %rax
|
|
sbbq 16(%rsp), %r14
|
|
movq $0x7fffffffffffffff, %rdx
|
|
sbbq 24(%rsp), %r15
|
|
sbbq $0x00, %rbp
|
|
# Mask the modulus
|
|
andq %rbp, %rax
|
|
andq %rbp, %rdx
|
|
# Add modulus (if underflow)
|
|
addq %rax, %r12
|
|
adcq %rbp, %r13
|
|
adcq %rbp, %r14
|
|
adcq %rdx, %r15
|
|
movq %rcx, (%rdi)
|
|
movq %r9, 8(%rdi)
|
|
movq %r10, 16(%rdi)
|
|
movq %r11, 24(%rdi)
|
|
movq %r12, 128(%rsp)
|
|
movq %r13, 136(%rsp)
|
|
movq %r14, 144(%rsp)
|
|
movq %r15, 152(%rsp)
|
|
# Add
|
|
movq 64(%rsp), %rcx
|
|
movq 72(%rsp), %r9
|
|
movq 80(%rsp), %r10
|
|
movq 88(%rsp), %rbp
|
|
movq %rcx, %r12
|
|
addq 32(%rsp), %rcx
|
|
movq %r9, %r13
|
|
adcq 40(%rsp), %r9
|
|
movq %r10, %r14
|
|
adcq 48(%rsp), %r10
|
|
movq %rbp, %r15
|
|
adcq 56(%rsp), %rbp
|
|
movq $-19, %rax
|
|
movq %rbp, %r11
|
|
movq $0x7fffffffffffffff, %rdx
|
|
sarq $63, %rbp
|
|
# Mask the modulus
|
|
andq %rbp, %rax
|
|
andq %rbp, %rdx
|
|
# Sub modulus (if overflow)
|
|
subq %rax, %rcx
|
|
sbbq %rbp, %r9
|
|
sbbq %rbp, %r10
|
|
sbbq %rdx, %r11
|
|
# Sub
|
|
subq 32(%rsp), %r12
|
|
movq $0x00, %rbp
|
|
sbbq 40(%rsp), %r13
|
|
movq $-19, %rax
|
|
sbbq 48(%rsp), %r14
|
|
movq $0x7fffffffffffffff, %rdx
|
|
sbbq 56(%rsp), %r15
|
|
sbbq $0x00, %rbp
|
|
# Mask the modulus
|
|
andq %rbp, %rax
|
|
andq %rbp, %rdx
|
|
# Add modulus (if underflow)
|
|
addq %rax, %r12
|
|
adcq %rbp, %r13
|
|
adcq %rbp, %r14
|
|
adcq %rdx, %r15
|
|
movq %rcx, (%rsp)
|
|
movq %r9, 8(%rsp)
|
|
movq %r10, 16(%rsp)
|
|
movq %r11, 24(%rsp)
|
|
movq %r12, 96(%rsp)
|
|
movq %r13, 104(%rsp)
|
|
movq %r14, 112(%rsp)
|
|
movq %r15, 120(%rsp)
|
|
# Multiply
|
|
# A[0] * B[0]
|
|
movq (%rdi), %rax
|
|
mulq 96(%rsp)
|
|
movq %rax, %rcx
|
|
movq %rdx, %r9
|
|
# A[0] * B[1]
|
|
movq 8(%rdi), %rax
|
|
mulq 96(%rsp)
|
|
xorq %r10, %r10
|
|
addq %rax, %r9
|
|
adcq %rdx, %r10
|
|
# A[1] * B[0]
|
|
movq (%rdi), %rax
|
|
mulq 104(%rsp)
|
|
xorq %r11, %r11
|
|
addq %rax, %r9
|
|
adcq %rdx, %r10
|
|
adcq $0x00, %r11
|
|
# A[0] * B[2]
|
|
movq 16(%rdi), %rax
|
|
mulq 96(%rsp)
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
# A[1] * B[1]
|
|
movq 8(%rdi), %rax
|
|
mulq 104(%rsp)
|
|
xorq %r12, %r12
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
adcq $0x00, %r12
|
|
# A[2] * B[0]
|
|
movq (%rdi), %rax
|
|
mulq 112(%rsp)
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
adcq $0x00, %r12
|
|
# A[0] * B[3]
|
|
movq 24(%rdi), %rax
|
|
mulq 96(%rsp)
|
|
xorq %r13, %r13
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[1] * B[2]
|
|
movq 16(%rdi), %rax
|
|
mulq 104(%rsp)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[2] * B[1]
|
|
movq 8(%rdi), %rax
|
|
mulq 112(%rsp)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[3] * B[0]
|
|
movq (%rdi), %rax
|
|
mulq 120(%rsp)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[1] * B[3]
|
|
movq 24(%rdi), %rax
|
|
mulq 104(%rsp)
|
|
xorq %r14, %r14
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[2] * B[2]
|
|
movq 16(%rdi), %rax
|
|
mulq 112(%rsp)
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[3] * B[1]
|
|
movq 8(%rdi), %rax
|
|
mulq 120(%rsp)
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[2] * B[3]
|
|
movq 24(%rdi), %rax
|
|
mulq 112(%rsp)
|
|
xorq %r15, %r15
|
|
addq %rax, %r13
|
|
adcq %rdx, %r14
|
|
adcq $0x00, %r15
|
|
# A[3] * B[2]
|
|
movq 16(%rdi), %rax
|
|
mulq 120(%rsp)
|
|
addq %rax, %r13
|
|
adcq %rdx, %r14
|
|
adcq $0x00, %r15
|
|
# A[3] * B[3]
|
|
movq 24(%rdi), %rax
|
|
mulq 120(%rsp)
|
|
addq %rax, %r14
|
|
adcq %rdx, %r15
|
|
# Reduce
        movq $0x7fffffffffffffff, %rbp
        # Move top half into t4-t7 and remove top bit from t3
        shldq $0x01, %r14, %r15
        shldq $0x01, %r13, %r14
        shldq $0x01, %r12, %r13
        shldq $0x01, %r11, %r12
        andq %rbp, %r11
        # Multiply top half by 19
        movq $19, %rax
        mulq %r12
        xorq %r12, %r12
        addq %rax, %rcx
        movq $19, %rax
        adcq %rdx, %r12
        mulq %r13
        xorq %r13, %r13
        addq %rax, %r9
        movq $19, %rax
        adcq %rdx, %r13
        mulq %r14
        xorq %r14, %r14
        addq %rax, %r10
        movq $19, %rax
        adcq %rdx, %r14
        mulq %r15
        # Add remaining product results in
        addq %r12, %r9
        adcq %r13, %r10
        adcq %r14, %r11
        adcq %rax, %r11
        adcq $0x00, %rdx
        # Overflow
        shldq $0x01, %r11, %rdx
        imulq $19, %rdx, %rax
        andq %rbp, %r11
        addq %rax, %rcx
        adcq $0x00, %r9
        adcq $0x00, %r10
        adcq $0x00, %r11
        # Reduce if top bit set
        movq %r11, %rdx
        shrq $63, %rdx
        imulq $19, %rdx, %rax
        andq %rbp, %r11
        addq %rax, %rcx
        adcq $0x00, %r9
        adcq $0x00, %r10
        adcq $0x00, %r11
        # Store
        movq %rcx, 32(%rsp)
        movq %r9, 40(%rsp)
        movq %r10, 48(%rsp)
        movq %r11, 56(%rsp)
|
|
# Multiply
|
|
# A[0] * B[0]
|
|
movq 128(%rsp), %rax
|
|
mulq (%rsp)
|
|
movq %rax, %rcx
|
|
movq %rdx, %r9
|
|
# A[0] * B[1]
|
|
movq 136(%rsp), %rax
|
|
mulq (%rsp)
|
|
xorq %r10, %r10
|
|
addq %rax, %r9
|
|
adcq %rdx, %r10
|
|
# A[1] * B[0]
|
|
movq 128(%rsp), %rax
|
|
mulq 8(%rsp)
|
|
xorq %r11, %r11
|
|
addq %rax, %r9
|
|
adcq %rdx, %r10
|
|
adcq $0x00, %r11
|
|
# A[0] * B[2]
|
|
movq 144(%rsp), %rax
|
|
mulq (%rsp)
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
# A[1] * B[1]
|
|
movq 136(%rsp), %rax
|
|
mulq 8(%rsp)
|
|
xorq %r12, %r12
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
adcq $0x00, %r12
|
|
# A[2] * B[0]
|
|
movq 128(%rsp), %rax
|
|
mulq 16(%rsp)
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
adcq $0x00, %r12
|
|
# A[0] * B[3]
|
|
movq 152(%rsp), %rax
|
|
mulq (%rsp)
|
|
xorq %r13, %r13
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[1] * B[2]
|
|
movq 144(%rsp), %rax
|
|
mulq 8(%rsp)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[2] * B[1]
|
|
movq 136(%rsp), %rax
|
|
mulq 16(%rsp)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[3] * B[0]
|
|
movq 128(%rsp), %rax
|
|
mulq 24(%rsp)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[1] * B[3]
|
|
movq 152(%rsp), %rax
|
|
mulq 8(%rsp)
|
|
xorq %r14, %r14
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[2] * B[2]
|
|
movq 144(%rsp), %rax
|
|
mulq 16(%rsp)
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[3] * B[1]
|
|
movq 136(%rsp), %rax
|
|
mulq 24(%rsp)
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[2] * B[3]
|
|
movq 152(%rsp), %rax
|
|
mulq 16(%rsp)
|
|
xorq %r15, %r15
|
|
addq %rax, %r13
|
|
adcq %rdx, %r14
|
|
adcq $0x00, %r15
|
|
# A[3] * B[2]
|
|
movq 144(%rsp), %rax
|
|
mulq 24(%rsp)
|
|
addq %rax, %r13
|
|
adcq %rdx, %r14
|
|
adcq $0x00, %r15
|
|
# A[3] * B[3]
|
|
movq 152(%rsp), %rax
|
|
mulq 24(%rsp)
|
|
addq %rax, %r14
|
|
adcq %rdx, %r15
|
|
# Reduce
|
|
movq $0x7fffffffffffffff, %rbp
|
|
# Move top half into t4-t7 and remove top bit from t3
|
|
shldq $0x01, %r14, %r15
|
|
shldq $0x01, %r13, %r14
|
|
shldq $0x01, %r12, %r13
|
|
shldq $0x01, %r11, %r12
|
|
andq %rbp, %r11
|
|
# Multiply top half by 19
|
|
movq $19, %rax
|
|
mulq %r12
|
|
xorq %r12, %r12
|
|
addq %rax, %rcx
|
|
movq $19, %rax
|
|
adcq %rdx, %r12
|
|
mulq %r13
|
|
xorq %r13, %r13
|
|
addq %rax, %r9
|
|
movq $19, %rax
|
|
adcq %rdx, %r13
|
|
mulq %r14
|
|
xorq %r14, %r14
|
|
addq %rax, %r10
|
|
movq $19, %rax
|
|
adcq %rdx, %r14
|
|
mulq %r15
|
|
# Add remaining product results in
|
|
addq %r12, %r9
|
|
adcq %r13, %r10
|
|
adcq %r14, %r11
|
|
adcq %rax, %r11
|
|
adcq $0x00, %rdx
|
|
# Overflow
|
|
shldq $0x01, %r11, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rbp, %r11
|
|
addq %rax, %rcx
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Reduce if top bit set
|
|
movq %r11, %rdx
|
|
shrq $63, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rbp, %r11
|
|
addq %rax, %rcx
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Store
|
|
movq %rcx, (%rsp)
|
|
movq %r9, 8(%rsp)
|
|
movq %r10, 16(%rsp)
|
|
movq %r11, 24(%rsp)
|
|
# Square
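        # Square: by symmetry only the cross products A[i]*A[j] with
        # i < j are computed; the partial sum is doubled and the
        # diagonal terms A[i]*A[i] added in, needing 10 mulq instead
        # of the 16 a general multiply takes.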
# A[0] * A[1]
|
|
movq 128(%rsp), %rax
|
|
mulq 136(%rsp)
|
|
movq %rax, %r9
|
|
movq %rdx, %r10
|
|
# A[0] * A[2]
|
|
movq 128(%rsp), %rax
|
|
mulq 144(%rsp)
|
|
xorq %r11, %r11
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
# A[0] * A[3]
|
|
movq 128(%rsp), %rax
|
|
mulq 152(%rsp)
|
|
xorq %r12, %r12
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
# A[1] * A[2]
|
|
movq 136(%rsp), %rax
|
|
mulq 144(%rsp)
|
|
xorq %r13, %r13
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[1] * A[3]
|
|
movq 136(%rsp), %rax
|
|
mulq 152(%rsp)
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
# A[2] * A[3]
|
|
movq 144(%rsp), %rax
|
|
mulq 152(%rsp)
|
|
xorq %r14, %r14
|
|
addq %rax, %r13
|
|
adcq %rdx, %r14
|
|
# Double
|
|
xorq %r15, %r15
|
|
addq %r9, %r9
|
|
adcq %r10, %r10
|
|
adcq %r11, %r11
|
|
adcq %r12, %r12
|
|
adcq %r13, %r13
|
|
adcq %r14, %r14
|
|
adcq $0x00, %r15
|
|
# A[0] * A[0]
|
|
movq 128(%rsp), %rax
|
|
mulq %rax
|
|
movq %rax, %rcx
|
|
movq %rdx, %rbp
|
|
# A[1] * A[1]
|
|
movq 136(%rsp), %rax
|
|
mulq %rax
|
|
addq %rbp, %r9
|
|
adcq %rax, %r10
|
|
adcq $0x00, %rdx
|
|
movq %rdx, %rbp
|
|
# A[2] * A[2]
|
|
movq 144(%rsp), %rax
|
|
mulq %rax
|
|
addq %rbp, %r11
|
|
adcq %rax, %r12
|
|
adcq $0x00, %rdx
|
|
movq %rdx, %rbp
|
|
# A[3] * A[3]
|
|
movq 152(%rsp), %rax
|
|
mulq %rax
|
|
addq %rax, %r14
|
|
adcq %rdx, %r15
|
|
addq %rbp, %r13
|
|
adcq $0x00, %r14
|
|
adcq $0x00, %r15
|
|
# Reduce
|
|
movq $0x7fffffffffffffff, %rbp
|
|
# Move top half into t4-t7 and remove top bit from t3
|
|
shldq $0x01, %r14, %r15
|
|
shldq $0x01, %r13, %r14
|
|
shldq $0x01, %r12, %r13
|
|
shldq $0x01, %r11, %r12
|
|
andq %rbp, %r11
|
|
# Multiply top half by 19
|
|
movq $19, %rax
|
|
mulq %r12
|
|
xorq %r12, %r12
|
|
addq %rax, %rcx
|
|
movq $19, %rax
|
|
adcq %rdx, %r12
|
|
mulq %r13
|
|
xorq %r13, %r13
|
|
addq %rax, %r9
|
|
movq $19, %rax
|
|
adcq %rdx, %r13
|
|
mulq %r14
|
|
xorq %r14, %r14
|
|
addq %rax, %r10
|
|
movq $19, %rax
|
|
adcq %rdx, %r14
|
|
mulq %r15
|
|
# Add remaining product results in
|
|
addq %r12, %r9
|
|
adcq %r13, %r10
|
|
adcq %r14, %r11
|
|
adcq %rax, %r11
|
|
adcq $0x00, %rdx
|
|
# Overflow
|
|
shldq $0x01, %r11, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rbp, %r11
|
|
addq %rax, %rcx
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Reduce if top bit set
|
|
movq %r11, %rdx
|
|
shrq $63, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rbp, %r11
|
|
addq %rax, %rcx
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Store
|
|
movq %rcx, 96(%rsp)
|
|
movq %r9, 104(%rsp)
|
|
movq %r10, 112(%rsp)
|
|
movq %r11, 120(%rsp)
|
|
# Square
|
|
# A[0] * A[1]
|
|
movq (%rdi), %rax
|
|
mulq 8(%rdi)
|
|
movq %rax, %r9
|
|
movq %rdx, %r10
|
|
# A[0] * A[2]
|
|
movq (%rdi), %rax
|
|
mulq 16(%rdi)
|
|
xorq %r11, %r11
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
# A[0] * A[3]
|
|
movq (%rdi), %rax
|
|
mulq 24(%rdi)
|
|
xorq %r12, %r12
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
# A[1] * A[2]
|
|
movq 8(%rdi), %rax
|
|
mulq 16(%rdi)
|
|
xorq %r13, %r13
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[1] * A[3]
|
|
movq 8(%rdi), %rax
|
|
mulq 24(%rdi)
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
# A[2] * A[3]
|
|
movq 16(%rdi), %rax
|
|
mulq 24(%rdi)
|
|
xorq %r14, %r14
|
|
addq %rax, %r13
|
|
adcq %rdx, %r14
|
|
# Double
|
|
xorq %r15, %r15
|
|
addq %r9, %r9
|
|
adcq %r10, %r10
|
|
adcq %r11, %r11
|
|
adcq %r12, %r12
|
|
adcq %r13, %r13
|
|
adcq %r14, %r14
|
|
adcq $0x00, %r15
|
|
# A[0] * A[0]
|
|
movq (%rdi), %rax
|
|
mulq %rax
|
|
movq %rax, %rcx
|
|
movq %rdx, %rbp
|
|
# A[1] * A[1]
|
|
movq 8(%rdi), %rax
|
|
mulq %rax
|
|
addq %rbp, %r9
|
|
adcq %rax, %r10
|
|
adcq $0x00, %rdx
|
|
movq %rdx, %rbp
|
|
# A[2] * A[2]
|
|
movq 16(%rdi), %rax
|
|
mulq %rax
|
|
addq %rbp, %r11
|
|
adcq %rax, %r12
|
|
adcq $0x00, %rdx
|
|
movq %rdx, %rbp
|
|
# A[3] * A[3]
|
|
movq 24(%rdi), %rax
|
|
mulq %rax
|
|
addq %rax, %r14
|
|
adcq %rdx, %r15
|
|
addq %rbp, %r13
|
|
adcq $0x00, %r14
|
|
adcq $0x00, %r15
|
|
# Reduce
|
|
movq $0x7fffffffffffffff, %rbp
|
|
# Move top half into t4-t7 and remove top bit from t3
|
|
shldq $0x01, %r14, %r15
|
|
shldq $0x01, %r13, %r14
|
|
shldq $0x01, %r12, %r13
|
|
shldq $0x01, %r11, %r12
|
|
andq %rbp, %r11
|
|
# Multiply top half by 19
|
|
movq $19, %rax
|
|
mulq %r12
|
|
xorq %r12, %r12
|
|
addq %rax, %rcx
|
|
movq $19, %rax
|
|
adcq %rdx, %r12
|
|
mulq %r13
|
|
xorq %r13, %r13
|
|
addq %rax, %r9
|
|
movq $19, %rax
|
|
adcq %rdx, %r13
|
|
mulq %r14
|
|
xorq %r14, %r14
|
|
addq %rax, %r10
|
|
movq $19, %rax
|
|
adcq %rdx, %r14
|
|
mulq %r15
|
|
# Add remaining product results in
|
|
addq %r12, %r9
|
|
adcq %r13, %r10
|
|
adcq %r14, %r11
|
|
adcq %rax, %r11
|
|
adcq $0x00, %rdx
|
|
# Overflow
|
|
shldq $0x01, %r11, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rbp, %r11
|
|
addq %rax, %rcx
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Reduce if top bit set
|
|
movq %r11, %rdx
|
|
shrq $63, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rbp, %r11
|
|
addq %rax, %rcx
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Store
|
|
movq %rcx, 128(%rsp)
|
|
movq %r9, 136(%rsp)
|
|
movq %r10, 144(%rsp)
|
|
movq %r11, 152(%rsp)
|
|
# Add
|
|
movq 32(%rsp), %rcx
|
|
movq 40(%rsp), %r9
|
|
movq 48(%rsp), %r10
|
|
movq 56(%rsp), %rbp
|
|
movq %rcx, %r12
|
|
addq (%rsp), %rcx
|
|
movq %r9, %r13
|
|
adcq 8(%rsp), %r9
|
|
movq %r10, %r14
|
|
adcq 16(%rsp), %r10
|
|
movq %rbp, %r15
|
|
adcq 24(%rsp), %rbp
|
|
movq $-19, %rax
|
|
movq %rbp, %r11
|
|
movq $0x7fffffffffffffff, %rdx
|
|
sarq $63, %rbp
|
|
# Mask the modulus
|
|
andq %rbp, %rax
|
|
andq %rbp, %rdx
|
|
# Sub modulus (if overflow)
|
|
subq %rax, %rcx
|
|
sbbq %rbp, %r9
|
|
sbbq %rbp, %r10
|
|
sbbq %rdx, %r11
|
|
# Sub
|
|
subq (%rsp), %r12
|
|
movq $0x00, %rbp
|
|
sbbq 8(%rsp), %r13
|
|
movq $-19, %rax
|
|
sbbq 16(%rsp), %r14
|
|
movq $0x7fffffffffffffff, %rdx
|
|
sbbq 24(%rsp), %r15
|
|
sbbq $0x00, %rbp
|
|
# Mask the modulus
|
|
andq %rbp, %rax
|
|
andq %rbp, %rdx
|
|
# Add modulus (if underflow)
|
|
addq %rax, %r12
|
|
adcq %rbp, %r13
|
|
adcq %rbp, %r14
|
|
adcq %rdx, %r15
|
|
movq %rcx, 64(%rsp)
|
|
movq %r9, 72(%rsp)
|
|
movq %r10, 80(%rsp)
|
|
movq %r11, 88(%rsp)
|
|
movq %r12, (%rsp)
|
|
movq %r13, 8(%rsp)
|
|
movq %r14, 16(%rsp)
|
|
movq %r15, 24(%rsp)
|
|
# Multiply
|
|
# A[0] * B[0]
|
|
movq 96(%rsp), %rax
|
|
mulq 128(%rsp)
|
|
movq %rax, %rcx
|
|
movq %rdx, %r9
|
|
# A[0] * B[1]
|
|
movq 104(%rsp), %rax
|
|
mulq 128(%rsp)
|
|
xorq %r10, %r10
|
|
addq %rax, %r9
|
|
adcq %rdx, %r10
|
|
# A[1] * B[0]
|
|
movq 96(%rsp), %rax
|
|
mulq 136(%rsp)
|
|
xorq %r11, %r11
|
|
addq %rax, %r9
|
|
adcq %rdx, %r10
|
|
adcq $0x00, %r11
|
|
# A[0] * B[2]
|
|
movq 112(%rsp), %rax
|
|
mulq 128(%rsp)
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
# A[1] * B[1]
|
|
movq 104(%rsp), %rax
|
|
mulq 136(%rsp)
|
|
xorq %r12, %r12
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
adcq $0x00, %r12
|
|
# A[2] * B[0]
|
|
movq 96(%rsp), %rax
|
|
mulq 144(%rsp)
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
adcq $0x00, %r12
|
|
# A[0] * B[3]
|
|
movq 120(%rsp), %rax
|
|
mulq 128(%rsp)
|
|
xorq %r13, %r13
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[1] * B[2]
|
|
movq 112(%rsp), %rax
|
|
mulq 136(%rsp)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[2] * B[1]
|
|
movq 104(%rsp), %rax
|
|
mulq 144(%rsp)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[3] * B[0]
|
|
movq 96(%rsp), %rax
|
|
mulq 152(%rsp)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[1] * B[3]
|
|
movq 120(%rsp), %rax
|
|
mulq 136(%rsp)
|
|
xorq %r14, %r14
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[2] * B[2]
|
|
movq 112(%rsp), %rax
|
|
mulq 144(%rsp)
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[3] * B[1]
|
|
movq 104(%rsp), %rax
|
|
mulq 152(%rsp)
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[2] * B[3]
|
|
movq 120(%rsp), %rax
|
|
mulq 144(%rsp)
|
|
xorq %r15, %r15
|
|
addq %rax, %r13
|
|
adcq %rdx, %r14
|
|
adcq $0x00, %r15
|
|
# A[3] * B[2]
|
|
movq 112(%rsp), %rax
|
|
mulq 152(%rsp)
|
|
addq %rax, %r13
|
|
adcq %rdx, %r14
|
|
adcq $0x00, %r15
|
|
# A[3] * B[3]
|
|
movq 120(%rsp), %rax
|
|
mulq 152(%rsp)
|
|
addq %rax, %r14
|
|
adcq %rdx, %r15
|
|
# Reduce
|
|
movq $0x7fffffffffffffff, %rbp
|
|
# Move top half into t4-t7 and remove top bit from t3
|
|
shldq $0x01, %r14, %r15
|
|
shldq $0x01, %r13, %r14
|
|
shldq $0x01, %r12, %r13
|
|
shldq $0x01, %r11, %r12
|
|
andq %rbp, %r11
|
|
# Multiply top half by 19
|
|
movq $19, %rax
|
|
mulq %r12
|
|
xorq %r12, %r12
|
|
addq %rax, %rcx
|
|
movq $19, %rax
|
|
adcq %rdx, %r12
|
|
mulq %r13
|
|
xorq %r13, %r13
|
|
addq %rax, %r9
|
|
movq $19, %rax
|
|
adcq %rdx, %r13
|
|
mulq %r14
|
|
xorq %r14, %r14
|
|
addq %rax, %r10
|
|
movq $19, %rax
|
|
adcq %rdx, %r14
|
|
mulq %r15
|
|
# Add remaining product results in
|
|
addq %r12, %r9
|
|
adcq %r13, %r10
|
|
adcq %r14, %r11
|
|
adcq %rax, %r11
|
|
adcq $0x00, %rdx
|
|
# Overflow
|
|
shldq $0x01, %r11, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rbp, %r11
|
|
addq %rax, %rcx
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Reduce if top bit set
|
|
movq %r11, %rdx
|
|
shrq $63, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rbp, %r11
|
|
addq %rax, %rcx
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Store
|
|
movq %rcx, (%rdi)
|
|
movq %r9, 8(%rdi)
|
|
movq %r10, 16(%rdi)
|
|
movq %r11, 24(%rdi)
|
|
# Sub
|
|
movq 128(%rsp), %rcx
|
|
movq 136(%rsp), %r9
|
|
movq 144(%rsp), %r10
|
|
movq 152(%rsp), %r11
|
|
subq 96(%rsp), %rcx
|
|
movq $0x00, %rbp
|
|
sbbq 104(%rsp), %r9
|
|
movq $-19, %rax
|
|
sbbq 112(%rsp), %r10
|
|
movq $0x7fffffffffffffff, %rdx
|
|
sbbq 120(%rsp), %r11
|
|
sbbq $0x00, %rbp
|
|
# Mask the modulus
|
|
andq %rbp, %rax
|
|
andq %rbp, %rdx
|
|
# Add modulus (if underflow)
|
|
addq %rax, %rcx
|
|
adcq %rbp, %r9
|
|
adcq %rbp, %r10
|
|
adcq %rdx, %r11
|
|
movq %rcx, 128(%rsp)
|
|
movq %r9, 136(%rsp)
|
|
movq %r10, 144(%rsp)
|
|
movq %r11, 152(%rsp)
|
|
# Square
|
|
# A[0] * A[1]
|
|
movq (%rsp), %rax
|
|
mulq 8(%rsp)
|
|
movq %rax, %r9
|
|
movq %rdx, %r10
|
|
# A[0] * A[2]
|
|
movq (%rsp), %rax
|
|
mulq 16(%rsp)
|
|
xorq %r11, %r11
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
# A[0] * A[3]
|
|
movq (%rsp), %rax
|
|
mulq 24(%rsp)
|
|
xorq %r12, %r12
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
# A[1] * A[2]
|
|
movq 8(%rsp), %rax
|
|
mulq 16(%rsp)
|
|
xorq %r13, %r13
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[1] * A[3]
|
|
movq 8(%rsp), %rax
|
|
mulq 24(%rsp)
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
# A[2] * A[3]
|
|
movq 16(%rsp), %rax
|
|
mulq 24(%rsp)
|
|
xorq %r14, %r14
|
|
addq %rax, %r13
|
|
adcq %rdx, %r14
|
|
# Double
|
|
xorq %r15, %r15
|
|
addq %r9, %r9
|
|
adcq %r10, %r10
|
|
adcq %r11, %r11
|
|
adcq %r12, %r12
|
|
adcq %r13, %r13
|
|
adcq %r14, %r14
|
|
adcq $0x00, %r15
|
|
# A[0] * A[0]
|
|
movq (%rsp), %rax
|
|
mulq %rax
|
|
movq %rax, %rcx
|
|
movq %rdx, %rbp
|
|
# A[1] * A[1]
|
|
movq 8(%rsp), %rax
|
|
mulq %rax
|
|
addq %rbp, %r9
|
|
adcq %rax, %r10
|
|
adcq $0x00, %rdx
|
|
movq %rdx, %rbp
|
|
# A[2] * A[2]
|
|
movq 16(%rsp), %rax
|
|
mulq %rax
|
|
addq %rbp, %r11
|
|
adcq %rax, %r12
|
|
adcq $0x00, %rdx
|
|
movq %rdx, %rbp
|
|
# A[3] * A[3]
|
|
movq 24(%rsp), %rax
|
|
mulq %rax
|
|
addq %rax, %r14
|
|
adcq %rdx, %r15
|
|
addq %rbp, %r13
|
|
adcq $0x00, %r14
|
|
adcq $0x00, %r15
|
|
# Reduce
|
|
movq $0x7fffffffffffffff, %rbp
|
|
# Move top half into t4-t7 and remove top bit from t3
|
|
shldq $0x01, %r14, %r15
|
|
shldq $0x01, %r13, %r14
|
|
shldq $0x01, %r12, %r13
|
|
shldq $0x01, %r11, %r12
|
|
andq %rbp, %r11
|
|
# Multiply top half by 19
|
|
movq $19, %rax
|
|
mulq %r12
|
|
xorq %r12, %r12
|
|
addq %rax, %rcx
|
|
movq $19, %rax
|
|
adcq %rdx, %r12
|
|
mulq %r13
|
|
xorq %r13, %r13
|
|
addq %rax, %r9
|
|
movq $19, %rax
|
|
adcq %rdx, %r13
|
|
mulq %r14
|
|
xorq %r14, %r14
|
|
addq %rax, %r10
|
|
movq $19, %rax
|
|
adcq %rdx, %r14
|
|
mulq %r15
|
|
# Add remaining product results in
|
|
addq %r12, %r9
|
|
adcq %r13, %r10
|
|
adcq %r14, %r11
|
|
adcq %rax, %r11
|
|
adcq $0x00, %rdx
|
|
# Overflow
|
|
shldq $0x01, %r11, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rbp, %r11
|
|
addq %rax, %rcx
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Reduce if top bit set
|
|
movq %r11, %rdx
|
|
shrq $63, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rbp, %r11
|
|
addq %rax, %rcx
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Store
|
|
movq %rcx, (%rsp)
|
|
movq %r9, 8(%rsp)
|
|
movq %r10, 16(%rsp)
|
|
movq %r11, 24(%rsp)
|
|
# Multiply by 121666
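        # Multiply by the curve constant a24 = (486662 + 2) / 4
        # = 121666 = 0x1db42, then fold the single overflow limb back
        # in with the usual *19 step (2^255 = 19 mod p).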
        movq $0x1db42, %rax
        mulq 128(%rsp)
        xorq %r10, %r10
        movq %rax, %rcx
        movq %rdx, %r9
        movq $0x1db42, %rax
        mulq 136(%rsp)
        xorq %r11, %r11
        addq %rax, %r9
        adcq %rdx, %r10
        movq $0x1db42, %rax
        mulq 144(%rsp)
        xorq %r13, %r13
        addq %rax, %r10
        adcq %rdx, %r11
        movq $0x1db42, %rax
        mulq 152(%rsp)
        movq $0x7fffffffffffffff, %r12
        addq %rax, %r11
        adcq %rdx, %r13
        shldq $0x01, %r11, %r13
        andq %r12, %r11
        movq $19, %rax
        mulq %r13
        addq %rax, %rcx
        adcq $0x00, %r9
        adcq $0x00, %r10
        adcq $0x00, %r11
        movq %rcx, 32(%rsp)
        movq %r9, 40(%rsp)
        movq %r10, 48(%rsp)
        movq %r11, 56(%rsp)
|
|
# Square
|
|
# A[0] * A[1]
|
|
movq 64(%rsp), %rax
|
|
mulq 72(%rsp)
|
|
movq %rax, %r9
|
|
movq %rdx, %r10
|
|
# A[0] * A[2]
|
|
movq 64(%rsp), %rax
|
|
mulq 80(%rsp)
|
|
xorq %r11, %r11
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
# A[0] * A[3]
|
|
movq 64(%rsp), %rax
|
|
mulq 88(%rsp)
|
|
xorq %r12, %r12
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
# A[1] * A[2]
|
|
movq 72(%rsp), %rax
|
|
mulq 80(%rsp)
|
|
xorq %r13, %r13
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[1] * A[3]
|
|
movq 72(%rsp), %rax
|
|
mulq 88(%rsp)
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
# A[2] * A[3]
|
|
movq 80(%rsp), %rax
|
|
mulq 88(%rsp)
|
|
xorq %r14, %r14
|
|
addq %rax, %r13
|
|
adcq %rdx, %r14
|
|
# Double
|
|
xorq %r15, %r15
|
|
addq %r9, %r9
|
|
adcq %r10, %r10
|
|
adcq %r11, %r11
|
|
adcq %r12, %r12
|
|
adcq %r13, %r13
|
|
adcq %r14, %r14
|
|
adcq $0x00, %r15
|
|
# A[0] * A[0]
|
|
movq 64(%rsp), %rax
|
|
mulq %rax
|
|
movq %rax, %rcx
|
|
movq %rdx, %rbp
|
|
# A[1] * A[1]
|
|
movq 72(%rsp), %rax
|
|
mulq %rax
|
|
addq %rbp, %r9
|
|
adcq %rax, %r10
|
|
adcq $0x00, %rdx
|
|
movq %rdx, %rbp
|
|
# A[2] * A[2]
|
|
movq 80(%rsp), %rax
|
|
mulq %rax
|
|
addq %rbp, %r11
|
|
adcq %rax, %r12
|
|
adcq $0x00, %rdx
|
|
movq %rdx, %rbp
|
|
# A[3] * A[3]
|
|
movq 88(%rsp), %rax
|
|
mulq %rax
|
|
addq %rax, %r14
|
|
adcq %rdx, %r15
|
|
addq %rbp, %r13
|
|
adcq $0x00, %r14
|
|
adcq $0x00, %r15
|
|
# Reduce
|
|
movq $0x7fffffffffffffff, %rbp
|
|
# Move top half into t4-t7 and remove top bit from t3
|
|
shldq $0x01, %r14, %r15
|
|
shldq $0x01, %r13, %r14
|
|
shldq $0x01, %r12, %r13
|
|
shldq $0x01, %r11, %r12
|
|
andq %rbp, %r11
|
|
# Multiply top half by 19
|
|
movq $19, %rax
|
|
mulq %r12
|
|
xorq %r12, %r12
|
|
addq %rax, %rcx
|
|
movq $19, %rax
|
|
adcq %rdx, %r12
|
|
mulq %r13
|
|
xorq %r13, %r13
|
|
addq %rax, %r9
|
|
movq $19, %rax
|
|
adcq %rdx, %r13
|
|
mulq %r14
|
|
xorq %r14, %r14
|
|
addq %rax, %r10
|
|
movq $19, %rax
|
|
adcq %rdx, %r14
|
|
mulq %r15
|
|
# Add remaining product results in
|
|
addq %r12, %r9
|
|
adcq %r13, %r10
|
|
adcq %r14, %r11
|
|
adcq %rax, %r11
|
|
adcq $0x00, %rdx
|
|
# Overflow
|
|
shldq $0x01, %r11, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rbp, %r11
|
|
addq %rax, %rcx
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Reduce if top bit set
|
|
movq %r11, %rdx
|
|
shrq $63, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rbp, %r11
|
|
addq %rax, %rcx
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Store
|
|
movq %rcx, 64(%rsp)
|
|
movq %r9, 72(%rsp)
|
|
movq %r10, 80(%rsp)
|
|
movq %r11, 88(%rsp)
|
|
# Add
|
|
movq 96(%rsp), %rcx
|
|
movq 104(%rsp), %r9
|
|
addq 32(%rsp), %rcx
|
|
movq 112(%rsp), %r10
|
|
adcq 40(%rsp), %r9
|
|
movq 120(%rsp), %rbp
|
|
adcq 48(%rsp), %r10
|
|
movq $-19, %rax
|
|
adcq 56(%rsp), %rbp
|
|
movq $0x7fffffffffffffff, %rdx
|
|
movq %rbp, %r11
|
|
sarq $63, %rbp
|
|
# Mask the modulus
|
|
andq %rbp, %rax
|
|
andq %rbp, %rdx
|
|
# Sub modulus (if overflow)
|
|
subq %rax, %rcx
|
|
sbbq %rbp, %r9
|
|
sbbq %rbp, %r10
|
|
sbbq %rdx, %r11
|
|
movq %rcx, 96(%rsp)
|
|
movq %r9, 104(%rsp)
|
|
movq %r10, 112(%rsp)
|
|
movq %r11, 120(%rsp)
|
|
# Multiply
|
|
# A[0] * B[0]
|
|
movq (%rsp), %rax
|
|
mulq (%r8)
|
|
movq %rax, %rcx
|
|
movq %rdx, %r9
|
|
# A[0] * B[1]
|
|
movq 8(%rsp), %rax
|
|
mulq (%r8)
|
|
xorq %r10, %r10
|
|
addq %rax, %r9
|
|
adcq %rdx, %r10
|
|
# A[1] * B[0]
|
|
movq (%rsp), %rax
|
|
mulq 8(%r8)
|
|
xorq %r11, %r11
|
|
addq %rax, %r9
|
|
adcq %rdx, %r10
|
|
adcq $0x00, %r11
|
|
# A[0] * B[2]
|
|
movq 16(%rsp), %rax
|
|
mulq (%r8)
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
# A[1] * B[1]
|
|
movq 8(%rsp), %rax
|
|
mulq 8(%r8)
|
|
xorq %r12, %r12
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
adcq $0x00, %r12
|
|
# A[2] * B[0]
|
|
movq (%rsp), %rax
|
|
mulq 16(%r8)
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
adcq $0x00, %r12
|
|
# A[0] * B[3]
|
|
movq 24(%rsp), %rax
|
|
mulq (%r8)
|
|
xorq %r13, %r13
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[1] * B[2]
|
|
movq 16(%rsp), %rax
|
|
mulq 8(%r8)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[2] * B[1]
|
|
movq 8(%rsp), %rax
|
|
mulq 16(%r8)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[3] * B[0]
|
|
movq (%rsp), %rax
|
|
mulq 24(%r8)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[1] * B[3]
|
|
movq 24(%rsp), %rax
|
|
mulq 8(%r8)
|
|
xorq %r14, %r14
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[2] * B[2]
|
|
movq 16(%rsp), %rax
|
|
mulq 16(%r8)
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[3] * B[1]
|
|
movq 8(%rsp), %rax
|
|
mulq 24(%r8)
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[2] * B[3]
|
|
movq 24(%rsp), %rax
|
|
mulq 16(%r8)
|
|
xorq %r15, %r15
|
|
addq %rax, %r13
|
|
adcq %rdx, %r14
|
|
adcq $0x00, %r15
|
|
# A[3] * B[2]
|
|
movq 16(%rsp), %rax
|
|
mulq 24(%r8)
|
|
addq %rax, %r13
|
|
adcq %rdx, %r14
|
|
adcq $0x00, %r15
|
|
# A[3] * B[3]
|
|
movq 24(%rsp), %rax
|
|
mulq 24(%r8)
|
|
addq %rax, %r14
|
|
adcq %rdx, %r15
|
|
# Reduce
|
|
movq $0x7fffffffffffffff, %rbp
|
|
# Move top half into t4-t7 and remove top bit from t3
|
|
shldq $0x01, %r14, %r15
|
|
shldq $0x01, %r13, %r14
|
|
shldq $0x01, %r12, %r13
|
|
shldq $0x01, %r11, %r12
|
|
andq %rbp, %r11
|
|
# Multiply top half by 19
|
|
movq $19, %rax
|
|
mulq %r12
|
|
xorq %r12, %r12
|
|
addq %rax, %rcx
|
|
movq $19, %rax
|
|
adcq %rdx, %r12
|
|
mulq %r13
|
|
xorq %r13, %r13
|
|
addq %rax, %r9
|
|
movq $19, %rax
|
|
adcq %rdx, %r13
|
|
mulq %r14
|
|
xorq %r14, %r14
|
|
addq %rax, %r10
|
|
movq $19, %rax
|
|
adcq %rdx, %r14
|
|
mulq %r15
|
|
# Add remaining product results in
|
|
addq %r12, %r9
|
|
adcq %r13, %r10
|
|
adcq %r14, %r11
|
|
adcq %rax, %r11
|
|
adcq $0x00, %rdx
|
|
# Overflow
|
|
shldq $0x01, %r11, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rbp, %r11
|
|
addq %rax, %rcx
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Reduce if top bit set
|
|
movq %r11, %rdx
|
|
shrq $63, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rbp, %r11
|
|
addq %rax, %rcx
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Store
|
|
movq %rcx, 32(%rsp)
|
|
movq %r9, 40(%rsp)
|
|
movq %r10, 48(%rsp)
|
|
movq %r11, 56(%rsp)
|
|
# Multiply
|
|
# A[0] * B[0]
|
|
movq 96(%rsp), %rax
|
|
mulq 128(%rsp)
|
|
movq %rax, %rcx
|
|
movq %rdx, %r9
|
|
# A[0] * B[1]
|
|
movq 104(%rsp), %rax
|
|
mulq 128(%rsp)
|
|
xorq %r10, %r10
|
|
addq %rax, %r9
|
|
adcq %rdx, %r10
|
|
# A[1] * B[0]
|
|
movq 96(%rsp), %rax
|
|
mulq 136(%rsp)
|
|
xorq %r11, %r11
|
|
addq %rax, %r9
|
|
adcq %rdx, %r10
|
|
adcq $0x00, %r11
|
|
# A[0] * B[2]
|
|
movq 112(%rsp), %rax
|
|
mulq 128(%rsp)
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
# A[1] * B[1]
|
|
movq 104(%rsp), %rax
|
|
mulq 136(%rsp)
|
|
xorq %r12, %r12
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
adcq $0x00, %r12
|
|
# A[2] * B[0]
|
|
movq 96(%rsp), %rax
|
|
mulq 144(%rsp)
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
adcq $0x00, %r12
|
|
# A[0] * B[3]
|
|
movq 120(%rsp), %rax
|
|
mulq 128(%rsp)
|
|
xorq %r13, %r13
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[1] * B[2]
|
|
movq 112(%rsp), %rax
|
|
mulq 136(%rsp)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[2] * B[1]
|
|
movq 104(%rsp), %rax
|
|
mulq 144(%rsp)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[3] * B[0]
|
|
movq 96(%rsp), %rax
|
|
mulq 152(%rsp)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[1] * B[3]
|
|
movq 120(%rsp), %rax
|
|
mulq 136(%rsp)
|
|
xorq %r14, %r14
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[2] * B[2]
|
|
movq 112(%rsp), %rax
|
|
mulq 144(%rsp)
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[3] * B[1]
|
|
movq 104(%rsp), %rax
|
|
mulq 152(%rsp)
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[2] * B[3]
|
|
movq 120(%rsp), %rax
|
|
mulq 144(%rsp)
|
|
xorq %r15, %r15
|
|
addq %rax, %r13
|
|
adcq %rdx, %r14
|
|
adcq $0x00, %r15
|
|
# A[3] * B[2]
|
|
movq 112(%rsp), %rax
|
|
mulq 152(%rsp)
|
|
addq %rax, %r13
|
|
adcq %rdx, %r14
|
|
adcq $0x00, %r15
|
|
# A[3] * B[3]
|
|
movq 120(%rsp), %rax
|
|
mulq 152(%rsp)
|
|
addq %rax, %r14
|
|
adcq %rdx, %r15
|
|
# Reduce
|
|
movq $0x7fffffffffffffff, %rbp
|
|
# Move top half into t4-t7 and remove top bit from t3
|
|
shldq $0x01, %r14, %r15
|
|
shldq $0x01, %r13, %r14
|
|
shldq $0x01, %r12, %r13
|
|
shldq $0x01, %r11, %r12
|
|
andq %rbp, %r11
|
|
# Multiply top half by 19
|
|
movq $19, %rax
|
|
mulq %r12
|
|
xorq %r12, %r12
|
|
addq %rax, %rcx
|
|
movq $19, %rax
|
|
adcq %rdx, %r12
|
|
mulq %r13
|
|
xorq %r13, %r13
|
|
addq %rax, %r9
|
|
movq $19, %rax
|
|
adcq %rdx, %r13
|
|
mulq %r14
|
|
xorq %r14, %r14
|
|
addq %rax, %r10
|
|
movq $19, %rax
|
|
adcq %rdx, %r14
|
|
mulq %r15
|
|
# Add remaining product results in
|
|
addq %r12, %r9
|
|
adcq %r13, %r10
|
|
adcq %r14, %r11
|
|
adcq %rax, %r11
|
|
adcq $0x00, %rdx
|
|
# Overflow
|
|
shldq $0x01, %r11, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rbp, %r11
|
|
addq %rax, %rcx
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Reduce if top bit set
|
|
movq %r11, %rdx
|
|
shrq $63, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rbp, %r11
|
|
addq %rax, %rcx
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Store
|
|
movq %rcx, (%rsp)
|
|
movq %r9, 8(%rsp)
|
|
movq %r10, 16(%rsp)
|
|
movq %r11, 24(%rsp)
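        # Ladder loop control: 168(%rsp) is the bit index inside the
        # current scalar word (reloaded with 63 on wrap) and 160(%rsp)
        # the word index, so the loop walks the scalar bits from most
        # to least significant, one ladder step per bit.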
        decb 168(%rsp)
        jge L_curve25519_x64_bits
        movq $63, 168(%rsp)
        decb 160(%rsp)
        jge L_curve25519_x64_words
# Invert
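        # Invert z2 via Fermat: z2^-1 = z2^(p-2) = z2^(2^255 - 21),
        # evaluated with a fixed addition chain of 254 squarings and 11
        # multiplies built from fe_sq_x64, fe_sq_n_x64 and fe_mul_x64.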
leaq 32(%rsp), %rdi
|
|
movq %rsp, %rsi
|
|
#ifndef __APPLE__
|
|
callq fe_sq_x64@plt
|
|
#else
|
|
callq _fe_sq_x64
|
|
#endif /* __APPLE__ */
|
|
leaq 64(%rsp), %rdi
|
|
leaq 32(%rsp), %rsi
|
|
#ifndef __APPLE__
|
|
callq fe_sq_x64@plt
|
|
#else
|
|
callq _fe_sq_x64
|
|
#endif /* __APPLE__ */
|
|
leaq 64(%rsp), %rdi
|
|
leaq 64(%rsp), %rsi
|
|
#ifndef __APPLE__
|
|
callq fe_sq_x64@plt
|
|
#else
|
|
callq _fe_sq_x64
|
|
#endif /* __APPLE__ */
|
|
leaq 64(%rsp), %rdi
|
|
movq %rsp, %rsi
|
|
leaq 64(%rsp), %rdx
|
|
#ifndef __APPLE__
|
|
callq fe_mul_x64@plt
|
|
#else
|
|
callq _fe_mul_x64
|
|
#endif /* __APPLE__ */
|
|
leaq 32(%rsp), %rdi
|
|
leaq 32(%rsp), %rsi
|
|
leaq 64(%rsp), %rdx
|
|
#ifndef __APPLE__
|
|
callq fe_mul_x64@plt
|
|
#else
|
|
callq _fe_mul_x64
|
|
#endif /* __APPLE__ */
|
|
leaq 96(%rsp), %rdi
|
|
leaq 32(%rsp), %rsi
|
|
#ifndef __APPLE__
|
|
callq fe_sq_x64@plt
|
|
#else
|
|
callq _fe_sq_x64
|
|
#endif /* __APPLE__ */
|
|
leaq 64(%rsp), %rdi
|
|
leaq 64(%rsp), %rsi
|
|
leaq 96(%rsp), %rdx
|
|
#ifndef __APPLE__
|
|
callq fe_mul_x64@plt
|
|
#else
|
|
callq _fe_mul_x64
|
|
#endif /* __APPLE__ */
|
|
leaq 96(%rsp), %rdi
|
|
leaq 64(%rsp), %rsi
|
|
#ifndef __APPLE__
|
|
callq fe_sq_x64@plt
|
|
#else
|
|
callq _fe_sq_x64
|
|
#endif /* __APPLE__ */
|
|
leaq 96(%rsp), %rdi
|
|
leaq 96(%rsp), %rsi
|
|
movq $4, %rdx
|
|
#ifndef __APPLE__
|
|
callq fe_sq_n_x64@plt
|
|
#else
|
|
callq _fe_sq_n_x64
|
|
#endif /* __APPLE__ */
|
|
leaq 64(%rsp), %rdi
|
|
leaq 96(%rsp), %rsi
|
|
leaq 64(%rsp), %rdx
|
|
#ifndef __APPLE__
|
|
callq fe_mul_x64@plt
|
|
#else
|
|
callq _fe_mul_x64
|
|
#endif /* __APPLE__ */
|
|
leaq 96(%rsp), %rdi
|
|
leaq 64(%rsp), %rsi
|
|
#ifndef __APPLE__
|
|
callq fe_sq_x64@plt
|
|
#else
|
|
callq _fe_sq_x64
|
|
#endif /* __APPLE__ */
|
|
leaq 96(%rsp), %rdi
|
|
leaq 96(%rsp), %rsi
|
|
movq $9, %rdx
|
|
#ifndef __APPLE__
|
|
callq fe_sq_n_x64@plt
|
|
#else
|
|
callq _fe_sq_n_x64
|
|
#endif /* __APPLE__ */
|
|
leaq 96(%rsp), %rdi
|
|
leaq 96(%rsp), %rsi
|
|
leaq 64(%rsp), %rdx
|
|
#ifndef __APPLE__
|
|
callq fe_mul_x64@plt
|
|
#else
|
|
callq _fe_mul_x64
|
|
#endif /* __APPLE__ */
|
|
leaq 128(%rsp), %rdi
|
|
leaq 96(%rsp), %rsi
|
|
#ifndef __APPLE__
|
|
callq fe_sq_x64@plt
|
|
#else
|
|
callq _fe_sq_x64
|
|
#endif /* __APPLE__ */
|
|
leaq 128(%rsp), %rdi
|
|
leaq 128(%rsp), %rsi
|
|
movq $19, %rdx
|
|
#ifndef __APPLE__
|
|
callq fe_sq_n_x64@plt
|
|
#else
|
|
callq _fe_sq_n_x64
|
|
#endif /* __APPLE__ */
|
|
leaq 96(%rsp), %rdi
|
|
leaq 128(%rsp), %rsi
|
|
leaq 96(%rsp), %rdx
|
|
#ifndef __APPLE__
|
|
callq fe_mul_x64@plt
|
|
#else
|
|
callq _fe_mul_x64
|
|
#endif /* __APPLE__ */
|
|
leaq 96(%rsp), %rdi
|
|
leaq 96(%rsp), %rsi
|
|
#ifndef __APPLE__
|
|
callq fe_sq_x64@plt
|
|
#else
|
|
callq _fe_sq_x64
|
|
#endif /* __APPLE__ */
|
|
leaq 96(%rsp), %rdi
|
|
leaq 96(%rsp), %rsi
|
|
movq $9, %rdx
|
|
#ifndef __APPLE__
|
|
callq fe_sq_n_x64@plt
|
|
#else
|
|
callq _fe_sq_n_x64
|
|
#endif /* __APPLE__ */
|
|
leaq 64(%rsp), %rdi
|
|
leaq 96(%rsp), %rsi
|
|
leaq 64(%rsp), %rdx
|
|
#ifndef __APPLE__
|
|
callq fe_mul_x64@plt
|
|
#else
|
|
callq _fe_mul_x64
|
|
#endif /* __APPLE__ */
|
|
leaq 96(%rsp), %rdi
|
|
leaq 64(%rsp), %rsi
|
|
#ifndef __APPLE__
|
|
callq fe_sq_x64@plt
|
|
#else
|
|
callq _fe_sq_x64
|
|
#endif /* __APPLE__ */
|
|
leaq 96(%rsp), %rdi
|
|
leaq 96(%rsp), %rsi
|
|
movq $49, %rdx
|
|
#ifndef __APPLE__
|
|
callq fe_sq_n_x64@plt
|
|
#else
|
|
callq _fe_sq_n_x64
|
|
#endif /* __APPLE__ */
|
|
leaq 96(%rsp), %rdi
|
|
leaq 96(%rsp), %rsi
|
|
leaq 64(%rsp), %rdx
|
|
#ifndef __APPLE__
|
|
callq fe_mul_x64@plt
|
|
#else
|
|
callq _fe_mul_x64
|
|
#endif /* __APPLE__ */
|
|
leaq 128(%rsp), %rdi
|
|
leaq 96(%rsp), %rsi
|
|
#ifndef __APPLE__
|
|
callq fe_sq_x64@plt
|
|
#else
|
|
callq _fe_sq_x64
|
|
#endif /* __APPLE__ */
|
|
leaq 128(%rsp), %rdi
|
|
leaq 128(%rsp), %rsi
|
|
movq $0x63, %rdx
|
|
#ifndef __APPLE__
|
|
callq fe_sq_n_x64@plt
|
|
#else
|
|
callq _fe_sq_n_x64
|
|
#endif /* __APPLE__ */
|
|
leaq 96(%rsp), %rdi
|
|
leaq 128(%rsp), %rsi
|
|
leaq 96(%rsp), %rdx
|
|
#ifndef __APPLE__
|
|
callq fe_mul_x64@plt
|
|
#else
|
|
callq _fe_mul_x64
|
|
#endif /* __APPLE__ */
|
|
leaq 96(%rsp), %rdi
|
|
leaq 96(%rsp), %rsi
|
|
#ifndef __APPLE__
|
|
callq fe_sq_x64@plt
|
|
#else
|
|
callq _fe_sq_x64
|
|
#endif /* __APPLE__ */
|
|
leaq 96(%rsp), %rdi
|
|
leaq 96(%rsp), %rsi
|
|
movq $49, %rdx
|
|
#ifndef __APPLE__
|
|
callq fe_sq_n_x64@plt
|
|
#else
|
|
callq _fe_sq_n_x64
|
|
#endif /* __APPLE__ */
|
|
leaq 64(%rsp), %rdi
|
|
leaq 96(%rsp), %rsi
|
|
leaq 64(%rsp), %rdx
|
|
#ifndef __APPLE__
|
|
callq fe_mul_x64@plt
|
|
#else
|
|
callq _fe_mul_x64
|
|
#endif /* __APPLE__ */
|
|
leaq 64(%rsp), %rdi
|
|
leaq 64(%rsp), %rsi
|
|
#ifndef __APPLE__
|
|
callq fe_sq_x64@plt
|
|
#else
|
|
callq _fe_sq_x64
|
|
#endif /* __APPLE__ */
|
|
leaq 64(%rsp), %rdi
|
|
leaq 64(%rsp), %rsi
|
|
movq $4, %rdx
|
|
#ifndef __APPLE__
|
|
callq fe_sq_n_x64@plt
|
|
#else
|
|
callq _fe_sq_n_x64
|
|
#endif /* __APPLE__ */
|
|
movq %rsp, %rdi
|
|
leaq 64(%rsp), %rsi
|
|
leaq 32(%rsp), %rdx
|
|
#ifndef __APPLE__
|
|
callq fe_mul_x64@plt
|
|
#else
|
|
callq _fe_mul_x64
|
|
#endif /* __APPLE__ */
|
|
movq 176(%rsp), %rdi
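        # 176(%rsp) holds the output pointer: the ladder kept x2 in the
        # caller's buffer, so the multiply below forms the affine result
        # x2 * z2^-1 in place, with z2^-1 at (%rsp) from the inversion.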
# Multiply
|
|
# A[0] * B[0]
|
|
movq (%rsp), %rax
|
|
mulq (%rdi)
|
|
movq %rax, %rcx
|
|
movq %rdx, %r9
|
|
# A[0] * B[1]
|
|
movq 8(%rsp), %rax
|
|
mulq (%rdi)
|
|
xorq %r10, %r10
|
|
addq %rax, %r9
|
|
adcq %rdx, %r10
|
|
# A[1] * B[0]
|
|
movq (%rsp), %rax
|
|
mulq 8(%rdi)
|
|
xorq %r11, %r11
|
|
addq %rax, %r9
|
|
adcq %rdx, %r10
|
|
adcq $0x00, %r11
|
|
# A[0] * B[2]
|
|
movq 16(%rsp), %rax
|
|
mulq (%rdi)
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
# A[1] * B[1]
|
|
movq 8(%rsp), %rax
|
|
mulq 8(%rdi)
|
|
xorq %r12, %r12
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
adcq $0x00, %r12
|
|
# A[2] * B[0]
|
|
movq (%rsp), %rax
|
|
mulq 16(%rdi)
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
adcq $0x00, %r12
|
|
# A[0] * B[3]
|
|
movq 24(%rsp), %rax
|
|
mulq (%rdi)
|
|
xorq %r13, %r13
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[1] * B[2]
|
|
movq 16(%rsp), %rax
|
|
mulq 8(%rdi)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[2] * B[1]
|
|
movq 8(%rsp), %rax
|
|
mulq 16(%rdi)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[3] * B[0]
|
|
movq (%rsp), %rax
|
|
mulq 24(%rdi)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[1] * B[3]
|
|
movq 24(%rsp), %rax
|
|
mulq 8(%rdi)
|
|
xorq %r14, %r14
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[2] * B[2]
|
|
movq 16(%rsp), %rax
|
|
mulq 16(%rdi)
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[3] * B[1]
|
|
movq 8(%rsp), %rax
|
|
mulq 24(%rdi)
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[2] * B[3]
|
|
movq 24(%rsp), %rax
|
|
mulq 16(%rdi)
|
|
xorq %r15, %r15
|
|
addq %rax, %r13
|
|
adcq %rdx, %r14
|
|
adcq $0x00, %r15
|
|
# A[3] * B[2]
|
|
movq 16(%rsp), %rax
|
|
mulq 24(%rdi)
|
|
addq %rax, %r13
|
|
adcq %rdx, %r14
|
|
adcq $0x00, %r15
|
|
# A[3] * B[3]
|
|
movq 24(%rsp), %rax
|
|
mulq 24(%rdi)
|
|
addq %rax, %r14
|
|
adcq %rdx, %r15
|
|
# Reduce
|
|
movq $0x7fffffffffffffff, %rbp
|
|
# Move top half into t4-t7 and remove top bit from t3
|
|
shldq $0x01, %r14, %r15
|
|
shldq $0x01, %r13, %r14
|
|
shldq $0x01, %r12, %r13
|
|
shldq $0x01, %r11, %r12
|
|
andq %rbp, %r11
|
|
# Multiply top half by 19
|
|
movq $19, %rax
|
|
mulq %r12
|
|
xorq %r12, %r12
|
|
addq %rax, %rcx
|
|
movq $19, %rax
|
|
adcq %rdx, %r12
|
|
mulq %r13
|
|
xorq %r13, %r13
|
|
addq %rax, %r9
|
|
movq $19, %rax
|
|
adcq %rdx, %r13
|
|
mulq %r14
|
|
xorq %r14, %r14
|
|
addq %rax, %r10
|
|
movq $19, %rax
|
|
adcq %rdx, %r14
|
|
mulq %r15
|
|
# Add remaining product results in
|
|
addq %r12, %r9
|
|
adcq %r13, %r10
|
|
adcq %r14, %r11
|
|
adcq %rax, %r11
|
|
adcq $0x00, %rdx
|
|
# Overflow
|
|
shldq $0x01, %r11, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rbp, %r11
|
|
addq %rax, %rcx
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Reduce if top bit set
|
|
movq %r11, %rdx
|
|
shrq $63, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rbp, %r11
|
|
addq %rax, %rcx
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Store
|
|
movq %rcx, (%rdi)
|
|
movq %r9, 8(%rdi)
|
|
movq %r10, 16(%rdi)
|
|
movq %r11, 24(%rdi)
        xorq %rax, %rax
        addq $0xb8, %rsp
        popq %rbp
        popq %rbx
        popq %r15
        popq %r14
        popq %r13
        popq %r12
        repz retq
#ifndef __APPLE__
.size curve25519_x64,.-curve25519_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_pow22523_x64
.type fe_pow22523_x64,@function
.align 4
fe_pow22523_x64:
#else
.section __TEXT,__text
.globl _fe_pow22523_x64
.p2align 2
_fe_pow22523_x64:
#endif /* __APPLE__ */
subq $0x70, %rsp
        # pow22523
        movq %rdi, 96(%rsp)
        movq %rsi, 104(%rsp)
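        # Computes z^((p-5)/8) = z^(2^252 - 3) mod p, the power used to
        # take modular square roots when decompressing Ed25519 points
        # (RFC 8032: x = u*v^3 * (u*v^7)^((p-5)/8), fixed up by a
        # factor of sqrt(-1) if needed).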
movq %rsp, %rdi
|
|
movq 104(%rsp), %rsi
|
|
#ifndef __APPLE__
|
|
callq fe_sq_x64@plt
|
|
#else
|
|
callq _fe_sq_x64
|
|
#endif /* __APPLE__ */
|
|
leaq 32(%rsp), %rdi
|
|
movq %rsp, %rsi
|
|
#ifndef __APPLE__
|
|
callq fe_sq_x64@plt
|
|
#else
|
|
callq _fe_sq_x64
|
|
#endif /* __APPLE__ */
|
|
leaq 32(%rsp), %rdi
|
|
leaq 32(%rsp), %rsi
|
|
#ifndef __APPLE__
|
|
callq fe_sq_x64@plt
|
|
#else
|
|
callq _fe_sq_x64
|
|
#endif /* __APPLE__ */
|
|
leaq 32(%rsp), %rdi
|
|
movq 104(%rsp), %rsi
|
|
leaq 32(%rsp), %rdx
|
|
#ifndef __APPLE__
|
|
callq fe_mul_x64@plt
|
|
#else
|
|
callq _fe_mul_x64
|
|
#endif /* __APPLE__ */
|
|
movq %rsp, %rdi
|
|
movq %rsp, %rsi
|
|
leaq 32(%rsp), %rdx
|
|
#ifndef __APPLE__
|
|
callq fe_mul_x64@plt
|
|
#else
|
|
callq _fe_mul_x64
|
|
#endif /* __APPLE__ */
|
|
movq %rsp, %rdi
|
|
movq %rsp, %rsi
|
|
#ifndef __APPLE__
|
|
callq fe_sq_x64@plt
|
|
#else
|
|
callq _fe_sq_x64
|
|
#endif /* __APPLE__ */
|
|
movq %rsp, %rdi
|
|
leaq 32(%rsp), %rsi
|
|
movq %rsp, %rdx
|
|
#ifndef __APPLE__
|
|
callq fe_mul_x64@plt
|
|
#else
|
|
callq _fe_mul_x64
|
|
#endif /* __APPLE__ */
|
|
leaq 32(%rsp), %rdi
|
|
movq %rsp, %rsi
|
|
#ifndef __APPLE__
|
|
callq fe_sq_x64@plt
|
|
#else
|
|
callq _fe_sq_x64
|
|
#endif /* __APPLE__ */
|
|
leaq 32(%rsp), %rdi
|
|
leaq 32(%rsp), %rsi
|
|
movq $4, %rdx
|
|
#ifndef __APPLE__
|
|
callq fe_sq_n_x64@plt
|
|
#else
|
|
callq _fe_sq_n_x64
|
|
#endif /* __APPLE__ */
|
|
movq %rsp, %rdi
|
|
leaq 32(%rsp), %rsi
|
|
movq %rsp, %rdx
|
|
#ifndef __APPLE__
|
|
callq fe_mul_x64@plt
|
|
#else
|
|
callq _fe_mul_x64
|
|
#endif /* __APPLE__ */
|
|
leaq 32(%rsp), %rdi
|
|
movq %rsp, %rsi
|
|
#ifndef __APPLE__
|
|
callq fe_sq_x64@plt
|
|
#else
|
|
callq _fe_sq_x64
|
|
#endif /* __APPLE__ */
|
|
leaq 32(%rsp), %rdi
|
|
leaq 32(%rsp), %rsi
|
|
movq $9, %rdx
|
|
#ifndef __APPLE__
|
|
callq fe_sq_n_x64@plt
|
|
#else
|
|
callq _fe_sq_n_x64
|
|
#endif /* __APPLE__ */
|
|
leaq 32(%rsp), %rdi
|
|
leaq 32(%rsp), %rsi
|
|
movq %rsp, %rdx
|
|
#ifndef __APPLE__
|
|
callq fe_mul_x64@plt
|
|
#else
|
|
callq _fe_mul_x64
|
|
#endif /* __APPLE__ */
|
|
leaq 64(%rsp), %rdi
|
|
leaq 32(%rsp), %rsi
|
|
#ifndef __APPLE__
|
|
callq fe_sq_x64@plt
|
|
#else
|
|
callq _fe_sq_x64
|
|
#endif /* __APPLE__ */
|
|
leaq 64(%rsp), %rdi
|
|
leaq 64(%rsp), %rsi
|
|
movq $19, %rdx
|
|
#ifndef __APPLE__
|
|
callq fe_sq_n_x64@plt
|
|
#else
|
|
callq _fe_sq_n_x64
|
|
#endif /* __APPLE__ */
|
|
leaq 32(%rsp), %rdi
|
|
leaq 64(%rsp), %rsi
|
|
leaq 32(%rsp), %rdx
|
|
#ifndef __APPLE__
|
|
callq fe_mul_x64@plt
|
|
#else
|
|
callq _fe_mul_x64
|
|
#endif /* __APPLE__ */
|
|
leaq 32(%rsp), %rdi
|
|
leaq 32(%rsp), %rsi
|
|
#ifndef __APPLE__
|
|
callq fe_sq_x64@plt
|
|
#else
|
|
callq _fe_sq_x64
|
|
#endif /* __APPLE__ */
|
|
leaq 32(%rsp), %rdi
|
|
leaq 32(%rsp), %rsi
|
|
movq $9, %rdx
|
|
#ifndef __APPLE__
|
|
callq fe_sq_n_x64@plt
|
|
#else
|
|
callq _fe_sq_n_x64
|
|
#endif /* __APPLE__ */
|
|
movq %rsp, %rdi
|
|
leaq 32(%rsp), %rsi
|
|
movq %rsp, %rdx
|
|
#ifndef __APPLE__
|
|
callq fe_mul_x64@plt
|
|
#else
|
|
callq _fe_mul_x64
|
|
#endif /* __APPLE__ */
|
|
leaq 32(%rsp), %rdi
|
|
movq %rsp, %rsi
|
|
#ifndef __APPLE__
|
|
callq fe_sq_x64@plt
|
|
#else
|
|
callq _fe_sq_x64
|
|
#endif /* __APPLE__ */
|
|
leaq 32(%rsp), %rdi
|
|
leaq 32(%rsp), %rsi
|
|
movq $49, %rdx
|
|
#ifndef __APPLE__
|
|
callq fe_sq_n_x64@plt
|
|
#else
|
|
callq _fe_sq_n_x64
|
|
#endif /* __APPLE__ */
|
|
leaq 32(%rsp), %rdi
|
|
leaq 32(%rsp), %rsi
|
|
movq %rsp, %rdx
|
|
#ifndef __APPLE__
|
|
callq fe_mul_x64@plt
|
|
#else
|
|
callq _fe_mul_x64
|
|
#endif /* __APPLE__ */
|
|
leaq 64(%rsp), %rdi
|
|
leaq 32(%rsp), %rsi
|
|
#ifndef __APPLE__
|
|
callq fe_sq_x64@plt
|
|
#else
|
|
callq _fe_sq_x64
|
|
#endif /* __APPLE__ */
|
|
leaq 64(%rsp), %rdi
|
|
leaq 64(%rsp), %rsi
|
|
movq $0x63, %rdx
|
|
#ifndef __APPLE__
|
|
callq fe_sq_n_x64@plt
|
|
#else
|
|
callq _fe_sq_n_x64
|
|
#endif /* __APPLE__ */
|
|
leaq 32(%rsp), %rdi
|
|
leaq 64(%rsp), %rsi
|
|
leaq 32(%rsp), %rdx
|
|
#ifndef __APPLE__
|
|
callq fe_mul_x64@plt
|
|
#else
|
|
callq _fe_mul_x64
|
|
#endif /* __APPLE__ */
|
|
leaq 32(%rsp), %rdi
|
|
leaq 32(%rsp), %rsi
|
|
#ifndef __APPLE__
|
|
callq fe_sq_x64@plt
|
|
#else
|
|
callq _fe_sq_x64
|
|
#endif /* __APPLE__ */
|
|
leaq 32(%rsp), %rdi
|
|
leaq 32(%rsp), %rsi
|
|
movq $49, %rdx
|
|
#ifndef __APPLE__
|
|
callq fe_sq_n_x64@plt
|
|
#else
|
|
callq _fe_sq_n_x64
|
|
#endif /* __APPLE__ */
|
|
movq %rsp, %rdi
|
|
leaq 32(%rsp), %rsi
|
|
movq %rsp, %rdx
|
|
#ifndef __APPLE__
|
|
callq fe_mul_x64@plt
|
|
#else
|
|
callq _fe_mul_x64
|
|
#endif /* __APPLE__ */
|
|
movq %rsp, %rdi
|
|
movq %rsp, %rsi
|
|
#ifndef __APPLE__
|
|
callq fe_sq_x64@plt
|
|
#else
|
|
callq _fe_sq_x64
|
|
#endif /* __APPLE__ */
|
|
movq %rsp, %rdi
|
|
movq %rsp, %rsi
|
|
#ifndef __APPLE__
|
|
callq fe_sq_x64@plt
|
|
#else
|
|
callq _fe_sq_x64
|
|
#endif /* __APPLE__ */
|
|
movq 96(%rsp), %rdi
|
|
movq %rsp, %rsi
|
|
movq 104(%rsp), %rdx
|
|
#ifndef __APPLE__
|
|
callq fe_mul_x64@plt
|
|
#else
|
|
callq _fe_mul_x64
|
|
#endif /* __APPLE__ */
        movq 104(%rsp), %rsi
        movq 96(%rsp), %rdi
        addq $0x70, %rsp
        repz retq
#ifndef __APPLE__
.size fe_pow22523_x64,.-fe_pow22523_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_ge_to_p2_x64
.type fe_ge_to_p2_x64,@function
.align 4
fe_ge_to_p2_x64:
#else
.section __TEXT,__text
.globl _fe_ge_to_p2_x64
.p2align 2
_fe_ge_to_p2_x64:
#endif /* __APPLE__ */
pushq %rbx
|
|
pushq %r12
|
|
pushq %r13
|
|
pushq %r14
|
|
pushq %r15
|
|
subq $40, %rsp
|
|
movq %rsi, (%rsp)
|
|
movq %rdx, 8(%rsp)
|
|
movq %rcx, 16(%rsp)
|
|
movq %r8, 24(%rsp)
|
|
movq %r9, 32(%rsp)
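        # fe_ge_to_p2_x64(rx, ry, rz, px, py, pz, pt): convert a point
        # from p1p1 to projective p2 form with three field multiplies:
        # rx = px*pt, ry = py*pz, rz = pz*pt.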
movq 16(%rsp), %rsi
|
|
movq 88(%rsp), %rbx
|
|
# Multiply
|
|
# A[0] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq (%rsi)
|
|
movq %rax, %r8
|
|
movq %rdx, %r9
|
|
# A[0] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq (%rsi)
|
|
xorq %r10, %r10
|
|
addq %rax, %r9
|
|
adcq %rdx, %r10
|
|
# A[1] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq 8(%rsi)
|
|
xorq %r11, %r11
|
|
addq %rax, %r9
|
|
adcq %rdx, %r10
|
|
adcq $0x00, %r11
|
|
# A[0] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq (%rsi)
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
# A[1] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq 8(%rsi)
|
|
xorq %r12, %r12
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
adcq $0x00, %r12
|
|
# A[2] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq 16(%rsi)
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
adcq $0x00, %r12
|
|
# A[0] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq (%rsi)
|
|
xorq %r13, %r13
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[1] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq 8(%rsi)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[2] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq 16(%rsi)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[3] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[1] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq 8(%rsi)
|
|
xorq %r14, %r14
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[2] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq 16(%rsi)
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[3] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[2] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq 16(%rsi)
|
|
xorq %r15, %r15
|
|
addq %rax, %r13
|
|
adcq %rdx, %r14
|
|
adcq $0x00, %r15
|
|
# A[3] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r13
|
|
adcq %rdx, %r14
|
|
adcq $0x00, %r15
|
|
# A[3] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r14
|
|
adcq %rdx, %r15
|
|
# Reduce
|
|
movq $0x7fffffffffffffff, %rcx
|
|
# Move top half into t4-t7 and remove top bit from t3
|
|
shldq $0x01, %r14, %r15
|
|
shldq $0x01, %r13, %r14
|
|
shldq $0x01, %r12, %r13
|
|
shldq $0x01, %r11, %r12
|
|
andq %rcx, %r11
|
|
# Multiply top half by 19
|
|
movq $19, %rax
|
|
mulq %r12
|
|
xorq %r12, %r12
|
|
addq %rax, %r8
|
|
movq $19, %rax
|
|
adcq %rdx, %r12
|
|
mulq %r13
|
|
xorq %r13, %r13
|
|
addq %rax, %r9
|
|
movq $19, %rax
|
|
adcq %rdx, %r13
|
|
mulq %r14
|
|
xorq %r14, %r14
|
|
addq %rax, %r10
|
|
movq $19, %rax
|
|
adcq %rdx, %r14
|
|
mulq %r15
|
|
# Add remaining product results in
|
|
addq %r12, %r9
|
|
adcq %r13, %r10
|
|
adcq %r14, %r11
|
|
adcq %rax, %r11
|
|
adcq $0x00, %rdx
|
|
# Overflow
|
|
shldq $0x01, %r11, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rcx, %r11
|
|
addq %rax, %r8
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Reduce if top bit set
|
|
movq %r11, %rdx
|
|
shrq $63, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rcx, %r11
|
|
addq %rax, %r8
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Store
|
|
movq %r8, (%rdi)
|
|
movq %r9, 8(%rdi)
|
|
movq %r10, 16(%rdi)
|
|
movq %r11, 24(%rdi)
|
|
movq (%rsp), %rdi
|
|
movq 24(%rsp), %rsi
|
|
movq 32(%rsp), %rbx
|
|
# Multiply
|
|
# A[0] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq (%rsi)
|
|
movq %rax, %r8
|
|
movq %rdx, %r9
|
|
# A[0] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq (%rsi)
|
|
xorq %r10, %r10
|
|
addq %rax, %r9
|
|
adcq %rdx, %r10
|
|
# A[1] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq 8(%rsi)
|
|
xorq %r11, %r11
|
|
addq %rax, %r9
|
|
adcq %rdx, %r10
|
|
adcq $0x00, %r11
|
|
# A[0] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq (%rsi)
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
# A[1] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq 8(%rsi)
|
|
xorq %r12, %r12
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
adcq $0x00, %r12
|
|
# A[2] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq 16(%rsi)
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
adcq $0x00, %r12
|
|
# A[0] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq (%rsi)
|
|
xorq %r13, %r13
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[1] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq 8(%rsi)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[2] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq 16(%rsi)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[3] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[1] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq 8(%rsi)
|
|
xorq %r14, %r14
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[2] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq 16(%rsi)
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[3] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[2] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq 16(%rsi)
|
|
xorq %r15, %r15
|
|
addq %rax, %r13
|
|
adcq %rdx, %r14
|
|
adcq $0x00, %r15
|
|
# A[3] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r13
|
|
adcq %rdx, %r14
|
|
adcq $0x00, %r15
|
|
# A[3] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r14
|
|
adcq %rdx, %r15
|
|
# Reduce
|
|
movq $0x7fffffffffffffff, %rcx
|
|
# Move top half into t4-t7 and remove top bit from t3
|
|
shldq $0x01, %r14, %r15
|
|
shldq $0x01, %r13, %r14
|
|
shldq $0x01, %r12, %r13
|
|
shldq $0x01, %r11, %r12
|
|
andq %rcx, %r11
|
|
# Multiply top half by 19
|
|
movq $19, %rax
|
|
mulq %r12
|
|
xorq %r12, %r12
|
|
addq %rax, %r8
|
|
movq $19, %rax
|
|
adcq %rdx, %r12
|
|
mulq %r13
|
|
xorq %r13, %r13
|
|
addq %rax, %r9
|
|
movq $19, %rax
|
|
adcq %rdx, %r13
|
|
mulq %r14
|
|
xorq %r14, %r14
|
|
addq %rax, %r10
|
|
movq $19, %rax
|
|
adcq %rdx, %r14
|
|
mulq %r15
|
|
# Add remaining product results in
|
|
addq %r12, %r9
|
|
adcq %r13, %r10
|
|
adcq %r14, %r11
|
|
adcq %rax, %r11
|
|
adcq $0x00, %rdx
|
|
# Overflow
|
|
shldq $0x01, %r11, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rcx, %r11
|
|
addq %rax, %r8
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Reduce if top bit set
|
|
movq %r11, %rdx
|
|
shrq $63, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rcx, %r11
|
|
addq %rax, %r8
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Store
|
|
movq %r8, (%rdi)
|
|
movq %r9, 8(%rdi)
|
|
movq %r10, 16(%rdi)
|
|
movq %r11, 24(%rdi)
|
|
movq 8(%rsp), %rdi
|
|
movq 32(%rsp), %rsi
|
|
movq 88(%rsp), %rbx
|
|
# Multiply
|
|
# A[0] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq (%rsi)
|
|
movq %rax, %r8
|
|
movq %rdx, %r9
|
|
# A[0] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq (%rsi)
|
|
xorq %r10, %r10
|
|
addq %rax, %r9
|
|
adcq %rdx, %r10
|
|
# A[1] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq 8(%rsi)
|
|
xorq %r11, %r11
|
|
addq %rax, %r9
|
|
adcq %rdx, %r10
|
|
adcq $0x00, %r11
|
|
# A[0] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq (%rsi)
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
# A[1] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq 8(%rsi)
|
|
xorq %r12, %r12
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
adcq $0x00, %r12
|
|
# A[2] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq 16(%rsi)
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
adcq $0x00, %r12
|
|
# A[0] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq (%rsi)
|
|
xorq %r13, %r13
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[1] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq 8(%rsi)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[2] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq 16(%rsi)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[3] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[1] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq 8(%rsi)
|
|
xorq %r14, %r14
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[2] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq 16(%rsi)
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[3] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[2] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq 16(%rsi)
|
|
xorq %r15, %r15
|
|
addq %rax, %r13
|
|
adcq %rdx, %r14
|
|
adcq $0x00, %r15
|
|
# A[3] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r13
|
|
adcq %rdx, %r14
|
|
adcq $0x00, %r15
|
|
# A[3] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r14
|
|
adcq %rdx, %r15
|
|
# Reduce
|
|
movq $0x7fffffffffffffff, %rcx
|
|
# Move top half into t4-t7 and remove top bit from t3
|
|
shldq $0x01, %r14, %r15
|
|
shldq $0x01, %r13, %r14
|
|
shldq $0x01, %r12, %r13
|
|
shldq $0x01, %r11, %r12
|
|
andq %rcx, %r11
|
|
# Multiply top half by 19
|
|
movq $19, %rax
|
|
mulq %r12
|
|
xorq %r12, %r12
|
|
addq %rax, %r8
|
|
movq $19, %rax
|
|
adcq %rdx, %r12
|
|
mulq %r13
|
|
xorq %r13, %r13
|
|
addq %rax, %r9
|
|
movq $19, %rax
|
|
adcq %rdx, %r13
|
|
mulq %r14
|
|
xorq %r14, %r14
|
|
addq %rax, %r10
|
|
movq $19, %rax
|
|
adcq %rdx, %r14
|
|
mulq %r15
|
|
# Add remaining product results in
|
|
addq %r12, %r9
|
|
adcq %r13, %r10
|
|
adcq %r14, %r11
|
|
adcq %rax, %r11
|
|
adcq $0x00, %rdx
|
|
# Overflow
|
|
shldq $0x01, %r11, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rcx, %r11
|
|
addq %rax, %r8
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Reduce if top bit set
|
|
movq %r11, %rdx
|
|
shrq $63, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rcx, %r11
|
|
addq %rax, %r8
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Store
|
|
movq %r8, (%rdi)
|
|
movq %r9, 8(%rdi)
|
|
movq %r10, 16(%rdi)
|
|
movq %r11, 24(%rdi)
|
|
addq $40, %rsp
|
|
popq %r15
|
|
popq %r14
|
|
popq %r13
|
|
popq %r12
|
|
popq %rbx
|
|
repz retq
|
|
#ifndef __APPLE__
|
|
.size fe_ge_to_p2_x64,.-fe_ge_to_p2_x64
|
|
#endif /* __APPLE__ */
|
|
#ifndef __APPLE__
|
|
.text
|
|
.globl fe_ge_to_p3_x64
|
|
.type fe_ge_to_p3_x64,@function
|
|
.align 4
|
|
fe_ge_to_p3_x64:
|
|
#else
|
|
.section __TEXT,__text
|
|
.globl _fe_ge_to_p3_x64
|
|
.p2align 2
|
|
_fe_ge_to_p3_x64:
|
|
#endif /* __APPLE__ */
|
|
pushq %rbx
|
|
pushq %r12
|
|
pushq %r13
|
|
pushq %r14
|
|
pushq %r15
|
|
subq $40, %rsp
|
|
movq %rsi, (%rsp)
|
|
movq %rdx, 8(%rsp)
|
|
movq %rcx, 16(%rsp)
|
|
movq %r8, 24(%rsp)
|
|
movq %r9, 32(%rsp)
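        # fe_ge_to_p3_x64(rx, ry, rz, rt, px, py, pz, pt): convert from
        # p1p1 to extended p3 form: rx = px*pt, ry = py*pz, rz = pz*pt,
        # followed by rt = px*py for the extended coordinate.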
movq 24(%rsp), %rsi
|
|
movq 96(%rsp), %rbx
|
|
# Multiply
|
|
# A[0] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq (%rsi)
|
|
movq %rax, %r8
|
|
movq %rdx, %r9
|
|
# A[0] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq (%rsi)
|
|
xorq %r10, %r10
|
|
addq %rax, %r9
|
|
adcq %rdx, %r10
|
|
# A[1] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq 8(%rsi)
|
|
xorq %r11, %r11
|
|
addq %rax, %r9
|
|
adcq %rdx, %r10
|
|
adcq $0x00, %r11
|
|
# A[0] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq (%rsi)
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
# A[1] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq 8(%rsi)
|
|
xorq %r12, %r12
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
adcq $0x00, %r12
|
|
# A[2] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq 16(%rsi)
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
adcq $0x00, %r12
|
|
# A[0] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq (%rsi)
|
|
xorq %r13, %r13
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[1] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq 8(%rsi)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[2] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq 16(%rsi)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[3] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[1] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq 8(%rsi)
|
|
xorq %r14, %r14
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[2] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq 16(%rsi)
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[3] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[2] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq 16(%rsi)
|
|
xorq %r15, %r15
|
|
addq %rax, %r13
|
|
adcq %rdx, %r14
|
|
adcq $0x00, %r15
|
|
# A[3] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r13
|
|
adcq %rdx, %r14
|
|
adcq $0x00, %r15
|
|
# A[3] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r14
|
|
adcq %rdx, %r15
|
|
# Reduce
|
|
movq $0x7fffffffffffffff, %rcx
|
|
# Move top half into t4-t7 and remove top bit from t3
|
|
shldq $0x01, %r14, %r15
|
|
shldq $0x01, %r13, %r14
|
|
shldq $0x01, %r12, %r13
|
|
shldq $0x01, %r11, %r12
|
|
andq %rcx, %r11
|
|
# Multiply top half by 19
|
|
movq $19, %rax
|
|
mulq %r12
|
|
xorq %r12, %r12
|
|
addq %rax, %r8
|
|
movq $19, %rax
|
|
adcq %rdx, %r12
|
|
mulq %r13
|
|
xorq %r13, %r13
|
|
addq %rax, %r9
|
|
movq $19, %rax
|
|
adcq %rdx, %r13
|
|
mulq %r14
|
|
xorq %r14, %r14
|
|
addq %rax, %r10
|
|
movq $19, %rax
|
|
adcq %rdx, %r14
|
|
mulq %r15
|
|
# Add remaining product results in
|
|
addq %r12, %r9
|
|
adcq %r13, %r10
|
|
adcq %r14, %r11
|
|
adcq %rax, %r11
|
|
adcq $0x00, %rdx
|
|
# Overflow
|
|
shldq $0x01, %r11, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rcx, %r11
|
|
addq %rax, %r8
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Reduce if top bit set
|
|
movq %r11, %rdx
|
|
shrq $63, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rcx, %r11
|
|
addq %rax, %r8
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Store
|
|
movq %r8, (%rdi)
|
|
movq %r9, 8(%rdi)
|
|
movq %r10, 16(%rdi)
|
|
movq %r11, 24(%rdi)
|
|
movq (%rsp), %rdi
|
|
movq 32(%rsp), %rsi
|
|
movq 88(%rsp), %rbx
|
|
# Multiply
|
|
# A[0] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq (%rsi)
|
|
movq %rax, %r8
|
|
movq %rdx, %r9
|
|
# A[0] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq (%rsi)
|
|
xorq %r10, %r10
|
|
addq %rax, %r9
|
|
adcq %rdx, %r10
|
|
# A[1] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq 8(%rsi)
|
|
xorq %r11, %r11
|
|
addq %rax, %r9
|
|
adcq %rdx, %r10
|
|
adcq $0x00, %r11
|
|
# A[0] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq (%rsi)
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
# A[1] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq 8(%rsi)
|
|
xorq %r12, %r12
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
adcq $0x00, %r12
|
|
# A[2] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq 16(%rsi)
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
adcq $0x00, %r12
|
|
# A[0] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq (%rsi)
|
|
xorq %r13, %r13
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[1] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq 8(%rsi)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[2] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq 16(%rsi)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[3] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[1] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq 8(%rsi)
|
|
xorq %r14, %r14
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[2] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq 16(%rsi)
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[3] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[2] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq 16(%rsi)
|
|
xorq %r15, %r15
|
|
addq %rax, %r13
|
|
adcq %rdx, %r14
|
|
adcq $0x00, %r15
|
|
# A[3] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r13
|
|
adcq %rdx, %r14
|
|
adcq $0x00, %r15
|
|
# A[3] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r14
|
|
adcq %rdx, %r15
|
|
# Reduce
|
|
movq $0x7fffffffffffffff, %rcx
|
|
# Move top half into t4-t7 and remove top bit from t3
|
|
shldq $0x01, %r14, %r15
|
|
shldq $0x01, %r13, %r14
|
|
shldq $0x01, %r12, %r13
|
|
shldq $0x01, %r11, %r12
|
|
andq %rcx, %r11
|
|
# Multiply top half by 19
|
|
movq $19, %rax
|
|
mulq %r12
|
|
xorq %r12, %r12
|
|
addq %rax, %r8
|
|
movq $19, %rax
|
|
adcq %rdx, %r12
|
|
mulq %r13
|
|
xorq %r13, %r13
|
|
addq %rax, %r9
|
|
movq $19, %rax
|
|
adcq %rdx, %r13
|
|
mulq %r14
|
|
xorq %r14, %r14
|
|
addq %rax, %r10
|
|
movq $19, %rax
|
|
adcq %rdx, %r14
|
|
mulq %r15
|
|
# Add remaining product results in
|
|
addq %r12, %r9
|
|
adcq %r13, %r10
|
|
adcq %r14, %r11
|
|
adcq %rax, %r11
|
|
adcq $0x00, %rdx
|
|
# Overflow
|
|
shldq $0x01, %r11, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rcx, %r11
|
|
addq %rax, %r8
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Reduce if top bit set
|
|
movq %r11, %rdx
|
|
shrq $63, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rcx, %r11
|
|
addq %rax, %r8
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Store
|
|
movq %r8, (%rdi)
|
|
movq %r9, 8(%rdi)
|
|
movq %r10, 16(%rdi)
|
|
movq %r11, 24(%rdi)
|
|
movq 8(%rsp), %rdi
|
|
movq 88(%rsp), %rsi
|
|
movq 96(%rsp), %rbx
|
|
# Multiply
|
|
# A[0] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq (%rsi)
|
|
movq %rax, %r8
|
|
movq %rdx, %r9
|
|
# A[0] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq (%rsi)
|
|
xorq %r10, %r10
|
|
addq %rax, %r9
|
|
adcq %rdx, %r10
|
|
# A[1] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq 8(%rsi)
|
|
xorq %r11, %r11
|
|
addq %rax, %r9
|
|
adcq %rdx, %r10
|
|
adcq $0x00, %r11
|
|
# A[0] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq (%rsi)
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
# A[1] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq 8(%rsi)
|
|
xorq %r12, %r12
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
adcq $0x00, %r12
|
|
# A[2] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq 16(%rsi)
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
adcq $0x00, %r12
|
|
# A[0] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq (%rsi)
|
|
xorq %r13, %r13
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[1] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq 8(%rsi)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[2] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq 16(%rsi)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[3] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[1] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq 8(%rsi)
|
|
xorq %r14, %r14
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[2] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq 16(%rsi)
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[3] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[2] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq 16(%rsi)
|
|
xorq %r15, %r15
|
|
addq %rax, %r13
|
|
adcq %rdx, %r14
|
|
adcq $0x00, %r15
|
|
# A[3] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r13
|
|
adcq %rdx, %r14
|
|
adcq $0x00, %r15
|
|
# A[3] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r14
|
|
adcq %rdx, %r15
|
|
# Reduce
|
|
movq $0x7fffffffffffffff, %rcx
|
|
# Move top half into t4-t7 and remove top bit from t3
|
|
shldq $0x01, %r14, %r15
|
|
shldq $0x01, %r13, %r14
|
|
shldq $0x01, %r12, %r13
|
|
shldq $0x01, %r11, %r12
|
|
andq %rcx, %r11
|
|
# Multiply top half by 19
|
|
movq $19, %rax
|
|
mulq %r12
|
|
xorq %r12, %r12
|
|
addq %rax, %r8
|
|
movq $19, %rax
|
|
adcq %rdx, %r12
|
|
mulq %r13
|
|
xorq %r13, %r13
|
|
addq %rax, %r9
|
|
movq $19, %rax
|
|
adcq %rdx, %r13
|
|
mulq %r14
|
|
xorq %r14, %r14
|
|
addq %rax, %r10
|
|
movq $19, %rax
|
|
adcq %rdx, %r14
|
|
mulq %r15
|
|
# Add remaining product results in
|
|
addq %r12, %r9
|
|
adcq %r13, %r10
|
|
adcq %r14, %r11
|
|
adcq %rax, %r11
|
|
adcq $0x00, %rdx
|
|
# Overflow
|
|
shldq $0x01, %r11, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rcx, %r11
|
|
addq %rax, %r8
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Reduce if top bit set
|
|
movq %r11, %rdx
|
|
shrq $63, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rcx, %r11
|
|
addq %rax, %r8
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Store
|
|
movq %r8, (%rdi)
|
|
movq %r9, 8(%rdi)
|
|
movq %r10, 16(%rdi)
|
|
movq %r11, 24(%rdi)
|
|
movq 16(%rsp), %rdi
|
|
movq 24(%rsp), %rsi
|
|
movq 32(%rsp), %rbx
|
|
# Multiply
|
|
# A[0] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq (%rsi)
|
|
movq %rax, %r8
|
|
movq %rdx, %r9
|
|
# A[0] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq (%rsi)
|
|
xorq %r10, %r10
|
|
addq %rax, %r9
|
|
adcq %rdx, %r10
|
|
# A[1] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq 8(%rsi)
|
|
xorq %r11, %r11
|
|
addq %rax, %r9
|
|
adcq %rdx, %r10
|
|
adcq $0x00, %r11
|
|
# A[0] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq (%rsi)
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
# A[1] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq 8(%rsi)
|
|
xorq %r12, %r12
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
adcq $0x00, %r12
|
|
# A[2] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq 16(%rsi)
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
adcq $0x00, %r12
|
|
# A[0] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq (%rsi)
|
|
xorq %r13, %r13
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[1] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq 8(%rsi)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[2] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq 16(%rsi)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[3] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[1] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq 8(%rsi)
|
|
xorq %r14, %r14
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[2] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq 16(%rsi)
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[3] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[2] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq 16(%rsi)
|
|
xorq %r15, %r15
|
|
addq %rax, %r13
|
|
adcq %rdx, %r14
|
|
adcq $0x00, %r15
|
|
# A[3] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r13
|
|
adcq %rdx, %r14
|
|
adcq $0x00, %r15
|
|
# A[3] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r14
|
|
adcq %rdx, %r15
|
|
# Reduce
|
|
movq $0x7fffffffffffffff, %rcx
|
|
# Move top half into t4-t7 and remove top bit from t3
|
|
shldq $0x01, %r14, %r15
|
|
shldq $0x01, %r13, %r14
|
|
shldq $0x01, %r12, %r13
|
|
shldq $0x01, %r11, %r12
|
|
andq %rcx, %r11
|
|
# Multiply top half by 19
|
|
movq $19, %rax
|
|
mulq %r12
|
|
xorq %r12, %r12
|
|
addq %rax, %r8
|
|
movq $19, %rax
|
|
adcq %rdx, %r12
|
|
mulq %r13
|
|
xorq %r13, %r13
|
|
addq %rax, %r9
|
|
movq $19, %rax
|
|
adcq %rdx, %r13
|
|
mulq %r14
|
|
xorq %r14, %r14
|
|
addq %rax, %r10
|
|
movq $19, %rax
|
|
adcq %rdx, %r14
|
|
mulq %r15
|
|
# Add remaining product results in
|
|
addq %r12, %r9
|
|
adcq %r13, %r10
|
|
adcq %r14, %r11
|
|
adcq %rax, %r11
|
|
adcq $0x00, %rdx
|
|
# Overflow
|
|
shldq $0x01, %r11, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rcx, %r11
|
|
addq %rax, %r8
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Reduce if top bit set
|
|
movq %r11, %rdx
|
|
shrq $63, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rcx, %r11
|
|
addq %rax, %r8
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Store
|
|
movq %r8, (%rdi)
|
|
movq %r9, 8(%rdi)
|
|
movq %r10, 16(%rdi)
|
|
movq %r11, 24(%rdi)
|
|
addq $40, %rsp
|
|
popq %r15
|
|
popq %r14
|
|
popq %r13
|
|
popq %r12
|
|
popq %rbx
|
|
repz retq
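# repz retq: the REP prefix on RET is the usual workaround for the
# branch-prediction penalty some older AMD cores take on a 1-byte RET
# that is itself a branch target; it behaves as a plain ret.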
#ifndef __APPLE__
|
|
.size fe_ge_to_p3_x64,.-fe_ge_to_p3_x64
|
|
#endif /* __APPLE__ */
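# fe_ge_dbl_x64: group-element doubling (per the function name).  Four
# result pointers arrive in rdi, rsi, rdx and rcx and the input
# coordinate pointers in r8, r9 and the first caller stack slot, which
# sits at 128(%rsp) once the five pushes and the 0x50-byte frame are in
# place; all are spilled to the frame so the field-operation blocks
# below can reload sources and destinations freely.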
#ifndef __APPLE__
|
|
.text
|
|
.globl fe_ge_dbl_x64
|
|
.type fe_ge_dbl_x64,@function
|
|
.align 4
|
|
fe_ge_dbl_x64:
|
|
#else
|
|
.section __TEXT,__text
|
|
.globl _fe_ge_dbl_x64
|
|
.p2align 2
|
|
_fe_ge_dbl_x64:
|
|
#endif /* __APPLE__ */
|
|
pushq %rbx
|
|
pushq %r12
|
|
pushq %r13
|
|
pushq %r14
|
|
pushq %r15
|
|
subq $0x50, %rsp
|
|
movq %rdi, (%rsp)
|
|
movq %rsi, 8(%rsp)
|
|
movq %rdx, 16(%rsp)
|
|
movq %rcx, 24(%rsp)
|
|
movq %r8, 32(%rsp)
|
|
movq %r9, 40(%rsp)
|
|
movq (%rsp), %rdi
|
|
movq 32(%rsp), %rsi
|
|
# Square
|
|
# A[0] * A[1]
|
|
movq (%rsi), %rax
|
|
mulq 8(%rsi)
|
|
movq %rax, %r9
|
|
movq %rdx, %r10
|
|
# A[0] * A[2]
|
|
movq (%rsi), %rax
|
|
mulq 16(%rsi)
|
|
xorq %r11, %r11
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
# A[0] * A[3]
|
|
movq (%rsi), %rax
|
|
mulq 24(%rsi)
|
|
xorq %r12, %r12
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
# A[1] * A[2]
|
|
movq 8(%rsi), %rax
|
|
mulq 16(%rsi)
|
|
xorq %r13, %r13
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[1] * A[3]
|
|
movq 8(%rsi), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
# A[2] * A[3]
|
|
movq 16(%rsi), %rax
|
|
mulq 24(%rsi)
|
|
xorq %r14, %r14
|
|
addq %rax, %r13
|
|
adcq %rdx, %r14
|
|
# Double
|
|
xorq %r15, %r15
|
|
addq %r9, %r9
|
|
adcq %r10, %r10
|
|
adcq %r11, %r11
|
|
adcq %r12, %r12
|
|
adcq %r13, %r13
|
|
adcq %r14, %r14
|
|
adcq $0x00, %r15
|
|
# A[0] * A[0]
|
|
movq (%rsi), %rax
|
|
mulq %rax
|
|
movq %rax, %r8
|
|
movq %rdx, %rcx
|
|
# A[1] * A[1]
|
|
movq 8(%rsi), %rax
|
|
mulq %rax
|
|
addq %rcx, %r9
|
|
adcq %rax, %r10
|
|
adcq $0x00, %rdx
|
|
movq %rdx, %rcx
|
|
# A[2] * A[2]
|
|
movq 16(%rsi), %rax
|
|
mulq %rax
|
|
addq %rcx, %r11
|
|
adcq %rax, %r12
|
|
adcq $0x00, %rdx
|
|
movq %rdx, %rcx
|
|
# A[3] * A[3]
|
|
movq 24(%rsi), %rax
|
|
mulq %rax
|
|
addq %rax, %r14
|
|
adcq %rdx, %r15
|
|
addq %rcx, %r13
|
|
adcq $0x00, %r14
|
|
adcq $0x00, %r15
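# Squaring exploits symmetry (A[i]*A[j] == A[j]*A[i]): the six cross
# products were computed once and doubled by the "Double" add/adc run
# above, then the four diagonal squares A[i]*A[i] were summed in,
# cutting the mulq count from 16 to 10 versus a full multiply.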
# Reduce
|
|
movq $0x7fffffffffffffff, %rcx
|
|
# Move top half into t4-t7 and remove top bit from t3
|
|
shldq $0x01, %r14, %r15
|
|
shldq $0x01, %r13, %r14
|
|
shldq $0x01, %r12, %r13
|
|
shldq $0x01, %r11, %r12
|
|
andq %rcx, %r11
|
|
# Multiply top half by 19
|
|
movq $19, %rax
|
|
mulq %r12
|
|
xorq %r12, %r12
|
|
addq %rax, %r8
|
|
movq $19, %rax
|
|
adcq %rdx, %r12
|
|
mulq %r13
|
|
xorq %r13, %r13
|
|
addq %rax, %r9
|
|
movq $19, %rax
|
|
adcq %rdx, %r13
|
|
mulq %r14
|
|
xorq %r14, %r14
|
|
addq %rax, %r10
|
|
movq $19, %rax
|
|
adcq %rdx, %r14
|
|
mulq %r15
|
|
# Add remaining product results in
|
|
addq %r12, %r9
|
|
adcq %r13, %r10
|
|
adcq %r14, %r11
|
|
adcq %rax, %r11
|
|
adcq $0x00, %rdx
|
|
# Overflow
|
|
shldq $0x01, %r11, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rcx, %r11
|
|
addq %rax, %r8
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Reduce if top bit set
|
|
movq %r11, %rdx
|
|
shrq $63, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rcx, %r11
|
|
addq %rax, %r8
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Store
|
|
movq %r8, (%rdi)
|
|
movq %r9, 8(%rdi)
|
|
movq %r10, 16(%rdi)
|
|
movq %r11, 24(%rdi)
|
|
movq 16(%rsp), %rdi
|
|
movq 40(%rsp), %rsi
|
|
# Square
|
|
# A[0] * A[1]
|
|
movq (%rsi), %rax
|
|
mulq 8(%rsi)
|
|
movq %rax, %r9
|
|
movq %rdx, %r10
|
|
# A[0] * A[2]
|
|
movq (%rsi), %rax
|
|
mulq 16(%rsi)
|
|
xorq %r11, %r11
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
# A[0] * A[3]
|
|
movq (%rsi), %rax
|
|
mulq 24(%rsi)
|
|
xorq %r12, %r12
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
# A[1] * A[2]
|
|
movq 8(%rsi), %rax
|
|
mulq 16(%rsi)
|
|
xorq %r13, %r13
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[1] * A[3]
|
|
movq 8(%rsi), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
# A[2] * A[3]
|
|
movq 16(%rsi), %rax
|
|
mulq 24(%rsi)
|
|
xorq %r14, %r14
|
|
addq %rax, %r13
|
|
adcq %rdx, %r14
|
|
# Double
|
|
xorq %r15, %r15
|
|
addq %r9, %r9
|
|
adcq %r10, %r10
|
|
adcq %r11, %r11
|
|
adcq %r12, %r12
|
|
adcq %r13, %r13
|
|
adcq %r14, %r14
|
|
adcq $0x00, %r15
|
|
# A[0] * A[0]
|
|
movq (%rsi), %rax
|
|
mulq %rax
|
|
movq %rax, %r8
|
|
movq %rdx, %rcx
|
|
# A[1] * A[1]
|
|
movq 8(%rsi), %rax
|
|
mulq %rax
|
|
addq %rcx, %r9
|
|
adcq %rax, %r10
|
|
adcq $0x00, %rdx
|
|
movq %rdx, %rcx
|
|
# A[2] * A[2]
|
|
movq 16(%rsi), %rax
|
|
mulq %rax
|
|
addq %rcx, %r11
|
|
adcq %rax, %r12
|
|
adcq $0x00, %rdx
|
|
movq %rdx, %rcx
|
|
# A[3] * A[3]
|
|
movq 24(%rsi), %rax
|
|
mulq %rax
|
|
addq %rax, %r14
|
|
adcq %rdx, %r15
|
|
addq %rcx, %r13
|
|
adcq $0x00, %r14
|
|
adcq $0x00, %r15
|
|
# Reduce
|
|
movq $0x7fffffffffffffff, %rcx
|
|
# Move top half into t4-t7 and remove top bit from t3
|
|
shldq $0x01, %r14, %r15
|
|
shldq $0x01, %r13, %r14
|
|
shldq $0x01, %r12, %r13
|
|
shldq $0x01, %r11, %r12
|
|
andq %rcx, %r11
|
|
# Multiply top half by 19
|
|
movq $19, %rax
|
|
mulq %r12
|
|
xorq %r12, %r12
|
|
addq %rax, %r8
|
|
movq $19, %rax
|
|
adcq %rdx, %r12
|
|
mulq %r13
|
|
xorq %r13, %r13
|
|
addq %rax, %r9
|
|
movq $19, %rax
|
|
adcq %rdx, %r13
|
|
mulq %r14
|
|
xorq %r14, %r14
|
|
addq %rax, %r10
|
|
movq $19, %rax
|
|
adcq %rdx, %r14
|
|
mulq %r15
|
|
# Add remaining product results in
|
|
addq %r12, %r9
|
|
adcq %r13, %r10
|
|
adcq %r14, %r11
|
|
adcq %rax, %r11
|
|
adcq $0x00, %rdx
|
|
# Overflow
|
|
shldq $0x01, %r11, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rcx, %r11
|
|
addq %rax, %r8
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Reduce if top bit set
|
|
movq %r11, %rdx
|
|
shrq $63, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rcx, %r11
|
|
addq %rax, %r8
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Store
|
|
movq %r8, (%rdi)
|
|
movq %r9, 8(%rdi)
|
|
movq %r10, 16(%rdi)
|
|
movq %r11, 24(%rdi)
|
|
movq 24(%rsp), %rdi
|
|
movq 128(%rsp), %rsi
|
|
# Square * 2
|
|
# A[0] * A[1]
|
|
movq (%rsi), %rax
|
|
mulq 8(%rsi)
|
|
movq %rax, %r9
|
|
movq %rdx, %r10
|
|
# A[0] * A[2]
|
|
movq (%rsi), %rax
|
|
mulq 16(%rsi)
|
|
xorq %r11, %r11
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
# A[0] * A[3]
|
|
movq (%rsi), %rax
|
|
mulq 24(%rsi)
|
|
xorq %r12, %r12
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
# A[1] * A[2]
|
|
movq 8(%rsi), %rax
|
|
mulq 16(%rsi)
|
|
xorq %r13, %r13
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[1] * A[3]
|
|
movq 8(%rsi), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
# A[2] * A[3]
|
|
movq 16(%rsi), %rax
|
|
mulq 24(%rsi)
|
|
xorq %r14, %r14
|
|
addq %rax, %r13
|
|
adcq %rdx, %r14
|
|
# Double
|
|
xorq %r15, %r15
|
|
addq %r9, %r9
|
|
adcq %r10, %r10
|
|
adcq %r11, %r11
|
|
adcq %r12, %r12
|
|
adcq %r13, %r13
|
|
adcq %r14, %r14
|
|
adcq $0x00, %r15
|
|
# A[0] * A[0]
|
|
movq (%rsi), %rax
|
|
mulq %rax
|
|
movq %rax, %r8
|
|
movq %rdx, %rcx
|
|
# A[1] * A[1]
|
|
movq 8(%rsi), %rax
|
|
mulq %rax
|
|
addq %rcx, %r9
|
|
adcq %rax, %r10
|
|
adcq $0x00, %rdx
|
|
movq %rdx, %rcx
|
|
# A[2] * A[2]
|
|
movq 16(%rsi), %rax
|
|
mulq %rax
|
|
addq %rcx, %r11
|
|
adcq %rax, %r12
|
|
adcq $0x00, %rdx
|
|
movq %rdx, %rcx
|
|
# A[3] * A[3]
|
|
movq 24(%rsi), %rax
|
|
mulq %rax
|
|
addq %rax, %r14
|
|
adcq %rdx, %r15
|
|
addq %rcx, %r13
|
|
adcq $0x00, %r14
|
|
adcq $0x00, %r15
|
|
# Reduce
|
|
movq $0x7fffffffffffffff, %rbx
|
|
xorq %rax, %rax
|
|
# Move top half into t4-t7 and remove top bit from t3
|
|
shldq $3, %r15, %rax
|
|
shldq $2, %r14, %r15
|
|
shldq $2, %r13, %r14
|
|
shldq $2, %r12, %r13
|
|
shldq $2, %r11, %r12
|
|
shldq $0x01, %r10, %r11
|
|
shldq $0x01, %r9, %r10
|
|
shldq $0x01, %r8, %r9
|
|
shlq $0x01, %r8
|
|
andq %rbx, %r11
|
|
# Two out left, one in right
|
|
andq %rbx, %r15
|
|
# Multiply top bits by 19*19
|
|
imulq $0x169, %rax, %rcx
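# 0x169 = 361 = 19*19.  For "square times 2" the doubling is folded
# into the reduction: the low half was shifted left by 1 and the high
# half by 2 (one shift for the *2, one to realign at the bit-255
# split), so the three bits gathered in rax have crossed the 2^255
# boundary twice and pick up a factor of 19 for each crossing.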
# Multiply top half by 19
|
|
movq $19, %rax
|
|
mulq %r12
|
|
xorq %r12, %r12
|
|
addq %rax, %r8
|
|
movq $19, %rax
|
|
adcq %rdx, %r12
|
|
mulq %r13
|
|
xorq %r13, %r13
|
|
addq %rax, %r9
|
|
movq $19, %rax
|
|
adcq %rdx, %r13
|
|
mulq %r14
|
|
xorq %r14, %r14
|
|
addq %rax, %r10
|
|
movq $19, %rax
|
|
adcq %rdx, %r14
|
|
mulq %r15
|
|
# Add remaining product results in
|
|
addq %rcx, %r8
|
|
adcq %r12, %r9
|
|
adcq %r13, %r10
|
|
adcq %r14, %r11
|
|
adcq %rax, %r11
|
|
adcq $0x00, %rdx
|
|
# Overflow
|
|
shldq $0x01, %r11, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rbx, %r11
|
|
addq %rax, %r8
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Reduce if top bit set
|
|
movq %r11, %rdx
|
|
shrq $63, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rbx, %r11
|
|
addq %rax, %r8
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Store
|
|
movq %r8, (%rdi)
|
|
movq %r9, 8(%rdi)
|
|
movq %r10, 16(%rdi)
|
|
movq %r11, 24(%rdi)
|
|
movq 8(%rsp), %rdi
|
|
movq 32(%rsp), %rsi
|
|
movq 40(%rsp), %rbx
|
|
# Add
|
|
movq (%rsi), %r8
|
|
movq 8(%rsi), %r9
|
|
addq (%rbx), %r8
|
|
movq 16(%rsi), %r10
|
|
adcq 8(%rbx), %r9
|
|
movq 24(%rsi), %rcx
|
|
adcq 16(%rbx), %r10
|
|
movq $-19, %rax
|
|
adcq 24(%rbx), %rcx
|
|
movq $0x7fffffffffffffff, %rdx
|
|
movq %rcx, %r11
|
|
sarq $63, %rcx
|
|
# Mask the modulus
|
|
andq %rcx, %rax
|
|
andq %rcx, %rdx
|
|
# Sub modulus (if overflow)
|
|
subq %rax, %r8
|
|
sbbq %rcx, %r9
|
|
sbbq %rcx, %r10
|
|
sbbq %rdx, %r11
|
|
movq %r8, (%rdi)
|
|
movq %r9, 8(%rdi)
|
|
movq %r10, 16(%rdi)
|
|
movq %r11, 24(%rdi)
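# Constant-time conditional reduce after the add: sarq $63 smears bit
# 255 of the sum into an all-zero/all-one mask, and the masked limbs
# (-19, -1, -1, 2^63-1) sum to exactly p = 2^255 - 19, so p is
# subtracted precisely when bit 255 was set, with no data-dependent
# branch.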
leaq 48(%rsp), %rdi
|
|
movq 8(%rsp), %rsi
|
|
# Square
|
|
# A[0] * A[1]
|
|
movq (%rsi), %rax
|
|
mulq 8(%rsi)
|
|
movq %rax, %r9
|
|
movq %rdx, %r10
|
|
# A[0] * A[2]
|
|
movq (%rsi), %rax
|
|
mulq 16(%rsi)
|
|
xorq %r11, %r11
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
# A[0] * A[3]
|
|
movq (%rsi), %rax
|
|
mulq 24(%rsi)
|
|
xorq %r12, %r12
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
# A[1] * A[2]
|
|
movq 8(%rsi), %rax
|
|
mulq 16(%rsi)
|
|
xorq %r13, %r13
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[1] * A[3]
|
|
movq 8(%rsi), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
# A[2] * A[3]
|
|
movq 16(%rsi), %rax
|
|
mulq 24(%rsi)
|
|
xorq %r14, %r14
|
|
addq %rax, %r13
|
|
adcq %rdx, %r14
|
|
# Double
|
|
xorq %r15, %r15
|
|
addq %r9, %r9
|
|
adcq %r10, %r10
|
|
adcq %r11, %r11
|
|
adcq %r12, %r12
|
|
adcq %r13, %r13
|
|
adcq %r14, %r14
|
|
adcq $0x00, %r15
|
|
# A[0] * A[0]
|
|
movq (%rsi), %rax
|
|
mulq %rax
|
|
movq %rax, %r8
|
|
movq %rdx, %rcx
|
|
# A[1] * A[1]
|
|
movq 8(%rsi), %rax
|
|
mulq %rax
|
|
addq %rcx, %r9
|
|
adcq %rax, %r10
|
|
adcq $0x00, %rdx
|
|
movq %rdx, %rcx
|
|
# A[2] * A[2]
|
|
movq 16(%rsi), %rax
|
|
mulq %rax
|
|
addq %rcx, %r11
|
|
adcq %rax, %r12
|
|
adcq $0x00, %rdx
|
|
movq %rdx, %rcx
|
|
# A[3] * A[3]
|
|
movq 24(%rsi), %rax
|
|
mulq %rax
|
|
addq %rax, %r14
|
|
adcq %rdx, %r15
|
|
addq %rcx, %r13
|
|
adcq $0x00, %r14
|
|
adcq $0x00, %r15
|
|
# Reduce
|
|
movq $0x7fffffffffffffff, %rcx
|
|
# Move top half into t4-t7 and remove top bit from t3
|
|
shldq $0x01, %r14, %r15
|
|
shldq $0x01, %r13, %r14
|
|
shldq $0x01, %r12, %r13
|
|
shldq $0x01, %r11, %r12
|
|
andq %rcx, %r11
|
|
# Multiply top half by 19
|
|
movq $19, %rax
|
|
mulq %r12
|
|
xorq %r12, %r12
|
|
addq %rax, %r8
|
|
movq $19, %rax
|
|
adcq %rdx, %r12
|
|
mulq %r13
|
|
xorq %r13, %r13
|
|
addq %rax, %r9
|
|
movq $19, %rax
|
|
adcq %rdx, %r13
|
|
mulq %r14
|
|
xorq %r14, %r14
|
|
addq %rax, %r10
|
|
movq $19, %rax
|
|
adcq %rdx, %r14
|
|
mulq %r15
|
|
# Add remaining product results in
|
|
addq %r12, %r9
|
|
adcq %r13, %r10
|
|
adcq %r14, %r11
|
|
adcq %rax, %r11
|
|
adcq $0x00, %rdx
|
|
# Overflow
|
|
shldq $0x01, %r11, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rcx, %r11
|
|
addq %rax, %r8
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Reduce if top bit set
|
|
movq %r11, %rdx
|
|
shrq $63, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rcx, %r11
|
|
addq %rax, %r8
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Store
|
|
movq %r8, (%rdi)
|
|
movq %r9, 8(%rdi)
|
|
movq %r10, 16(%rdi)
|
|
movq %r11, 24(%rdi)
|
|
movq 8(%rsp), %rdi
|
|
movq 16(%rsp), %rsi
|
|
movq (%rsp), %rbx
|
|
# Add
|
|
movq (%rsi), %r8
|
|
movq 8(%rsi), %r9
|
|
addq (%rbx), %r8
|
|
movq 16(%rsi), %r10
|
|
adcq 8(%rbx), %r9
|
|
movq 24(%rsi), %rcx
|
|
adcq 16(%rbx), %r10
|
|
movq $-19, %rax
|
|
adcq 24(%rbx), %rcx
|
|
movq $0x7fffffffffffffff, %rdx
|
|
movq %rcx, %r11
|
|
sarq $63, %rcx
|
|
# Mask the modulus
|
|
andq %rcx, %rax
|
|
andq %rcx, %rdx
|
|
# Sub modulus (if overflow)
|
|
subq %rax, %r8
|
|
sbbq %rcx, %r9
|
|
sbbq %rcx, %r10
|
|
sbbq %rdx, %r11
|
|
movq %r8, (%rdi)
|
|
movq %r9, 8(%rdi)
|
|
movq %r10, 16(%rdi)
|
|
movq %r11, 24(%rdi)
|
|
movq 16(%rsp), %rdi
|
|
movq 16(%rsp), %rsi
|
|
movq (%rsp), %rbx
|
|
# Sub
|
|
movq (%rsi), %r8
|
|
movq 8(%rsi), %r9
|
|
movq 16(%rsi), %r10
|
|
movq 24(%rsi), %r11
|
|
subq (%rbx), %r8
|
|
movq $0x00, %rcx
|
|
sbbq 8(%rbx), %r9
|
|
movq $-19, %rax
|
|
sbbq 16(%rbx), %r10
|
|
movq $0x7fffffffffffffff, %rdx
|
|
sbbq 24(%rbx), %r11
|
|
sbbq $0x00, %rcx
|
|
# Mask the modulus
|
|
andq %rcx, %rax
|
|
andq %rcx, %rdx
|
|
# Add modulus (if underflow)
|
|
addq %rax, %r8
|
|
adcq %rcx, %r9
|
|
adcq %rcx, %r10
|
|
adcq %rdx, %r11
|
|
movq %r8, (%rdi)
|
|
movq %r9, 8(%rdi)
|
|
movq %r10, 16(%rdi)
|
|
movq %r11, 24(%rdi)
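# Constant-time companion for subtraction: sbbq $0x00, %rcx turns the
# final borrow into a 0 / -1 mask, and the masked constant p is added
# back, so an underflow wraps into the field without branching.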
movq (%rsp), %rdi
|
|
leaq 48(%rsp), %rsi
|
|
movq 8(%rsp), %rbx
|
|
# Sub
|
|
movq (%rsi), %r8
|
|
movq 8(%rsi), %r9
|
|
movq 16(%rsi), %r10
|
|
movq 24(%rsi), %r11
|
|
subq (%rbx), %r8
|
|
movq $0x00, %rcx
|
|
sbbq 8(%rbx), %r9
|
|
movq $-19, %rax
|
|
sbbq 16(%rbx), %r10
|
|
movq $0x7fffffffffffffff, %rdx
|
|
sbbq 24(%rbx), %r11
|
|
sbbq $0x00, %rcx
|
|
# Mask the modulus
|
|
andq %rcx, %rax
|
|
andq %rcx, %rdx
|
|
# Add modulus (if underflow)
|
|
addq %rax, %r8
|
|
adcq %rcx, %r9
|
|
adcq %rcx, %r10
|
|
adcq %rdx, %r11
|
|
movq %r8, (%rdi)
|
|
movq %r9, 8(%rdi)
|
|
movq %r10, 16(%rdi)
|
|
movq %r11, 24(%rdi)
|
|
movq 24(%rsp), %rdi
|
|
movq 24(%rsp), %rsi
|
|
movq 16(%rsp), %rbx
|
|
# Sub
|
|
movq (%rsi), %r8
|
|
movq 8(%rsi), %r9
|
|
movq 16(%rsi), %r10
|
|
movq 24(%rsi), %r11
|
|
subq (%rbx), %r8
|
|
movq $0x00, %rcx
|
|
sbbq 8(%rbx), %r9
|
|
movq $-19, %rax
|
|
sbbq 16(%rbx), %r10
|
|
movq $0x7fffffffffffffff, %rdx
|
|
sbbq 24(%rbx), %r11
|
|
sbbq $0x00, %rcx
|
|
# Mask the modulus
|
|
andq %rcx, %rax
|
|
andq %rcx, %rdx
|
|
# Add modulus (if underflow)
|
|
addq %rax, %r8
|
|
adcq %rcx, %r9
|
|
adcq %rcx, %r10
|
|
adcq %rdx, %r11
|
|
movq %r8, (%rdi)
|
|
movq %r9, 8(%rdi)
|
|
movq %r10, 16(%rdi)
|
|
movq %r11, 24(%rdi)
|
|
addq $0x50, %rsp
|
|
popq %r15
|
|
popq %r14
|
|
popq %r13
|
|
popq %r12
|
|
popq %rbx
|
|
repz retq
|
|
#ifndef __APPLE__
|
|
.size fe_ge_dbl_x64,.-fe_ge_dbl_x64
|
|
#endif /* __APPLE__ */
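# fe_ge_madd_x64: mixed addition (per the function name), assembled
# from the same add, sub and multiply blocks.  Six pointers arrive in
# registers and are spilled to the frame; the remaining field-element
# pointers are read from the caller's stack at 128(%rsp)..160(%rsp)
# once the prologue's five pushes and 0x50-byte frame are in place.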
#ifndef __APPLE__
|
|
.text
|
|
.globl fe_ge_madd_x64
|
|
.type fe_ge_madd_x64,@function
|
|
.align 4
|
|
fe_ge_madd_x64:
|
|
#else
|
|
.section __TEXT,__text
|
|
.globl _fe_ge_madd_x64
|
|
.p2align 2
|
|
_fe_ge_madd_x64:
|
|
#endif /* __APPLE__ */
|
|
pushq %rbx
|
|
pushq %r12
|
|
pushq %r13
|
|
pushq %r14
|
|
pushq %r15
|
|
subq $0x50, %rsp
|
|
movq %rdi, (%rsp)
|
|
movq %rsi, 8(%rsp)
|
|
movq %rdx, 16(%rsp)
|
|
movq %rcx, 24(%rsp)
|
|
movq %r8, 32(%rsp)
|
|
movq %r9, 40(%rsp)
|
|
movq (%rsp), %rdi
|
|
movq 40(%rsp), %rsi
|
|
movq 32(%rsp), %rbx
|
|
# Add
|
|
movq (%rsi), %r8
|
|
movq 8(%rsi), %r9
|
|
addq (%rbx), %r8
|
|
movq 16(%rsi), %r10
|
|
adcq 8(%rbx), %r9
|
|
movq 24(%rsi), %rcx
|
|
adcq 16(%rbx), %r10
|
|
movq $-19, %rax
|
|
adcq 24(%rbx), %rcx
|
|
movq $0x7fffffffffffffff, %rdx
|
|
movq %rcx, %r11
|
|
sarq $63, %rcx
|
|
# Mask the modulus
|
|
andq %rcx, %rax
|
|
andq %rcx, %rdx
|
|
# Sub modulus (if overflow)
|
|
subq %rax, %r8
|
|
sbbq %rcx, %r9
|
|
sbbq %rcx, %r10
|
|
sbbq %rdx, %r11
|
|
movq %r8, (%rdi)
|
|
movq %r9, 8(%rdi)
|
|
movq %r10, 16(%rdi)
|
|
movq %r11, 24(%rdi)
|
|
movq 8(%rsp), %rdi
|
|
movq 40(%rsp), %rsi
|
|
movq 32(%rsp), %rbx
|
|
# Sub
|
|
movq (%rsi), %r8
|
|
movq 8(%rsi), %r9
|
|
movq 16(%rsi), %r10
|
|
movq 24(%rsi), %r11
|
|
subq (%rbx), %r8
|
|
movq $0x00, %rcx
|
|
sbbq 8(%rbx), %r9
|
|
movq $-19, %rax
|
|
sbbq 16(%rbx), %r10
|
|
movq $0x7fffffffffffffff, %rdx
|
|
sbbq 24(%rbx), %r11
|
|
sbbq $0x00, %rcx
|
|
# Mask the modulus
|
|
andq %rcx, %rax
|
|
andq %rcx, %rdx
|
|
# Add modulus (if underflow)
|
|
addq %rax, %r8
|
|
adcq %rcx, %r9
|
|
adcq %rcx, %r10
|
|
adcq %rdx, %r11
|
|
movq %r8, (%rdi)
|
|
movq %r9, 8(%rdi)
|
|
movq %r10, 16(%rdi)
|
|
movq %r11, 24(%rdi)
|
|
movq 16(%rsp), %rdi
|
|
movq (%rsp), %rsi
|
|
movq 152(%rsp), %rbx
|
|
# Multiply
|
|
# A[0] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq (%rsi)
|
|
movq %rax, %r8
|
|
movq %rdx, %r9
|
|
# A[0] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq (%rsi)
|
|
xorq %r10, %r10
|
|
addq %rax, %r9
|
|
adcq %rdx, %r10
|
|
# A[1] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq 8(%rsi)
|
|
xorq %r11, %r11
|
|
addq %rax, %r9
|
|
adcq %rdx, %r10
|
|
adcq $0x00, %r11
|
|
# A[0] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq (%rsi)
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
# A[1] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq 8(%rsi)
|
|
xorq %r12, %r12
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
adcq $0x00, %r12
|
|
# A[2] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq 16(%rsi)
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
adcq $0x00, %r12
|
|
# A[0] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq (%rsi)
|
|
xorq %r13, %r13
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[1] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq 8(%rsi)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[2] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq 16(%rsi)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[3] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[1] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq 8(%rsi)
|
|
xorq %r14, %r14
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[2] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq 16(%rsi)
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[3] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[2] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq 16(%rsi)
|
|
xorq %r15, %r15
|
|
addq %rax, %r13
|
|
adcq %rdx, %r14
|
|
adcq $0x00, %r15
|
|
# A[3] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r13
|
|
adcq %rdx, %r14
|
|
adcq $0x00, %r15
|
|
# A[3] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r14
|
|
adcq %rdx, %r15
|
|
# Reduce
|
|
movq $0x7fffffffffffffff, %rcx
|
|
# Move top half into t4-t7 and remove top bit from t3
|
|
shldq $0x01, %r14, %r15
|
|
shldq $0x01, %r13, %r14
|
|
shldq $0x01, %r12, %r13
|
|
shldq $0x01, %r11, %r12
|
|
andq %rcx, %r11
|
|
# Multiply top half by 19
|
|
movq $19, %rax
|
|
mulq %r12
|
|
xorq %r12, %r12
|
|
addq %rax, %r8
|
|
movq $19, %rax
|
|
adcq %rdx, %r12
|
|
mulq %r13
|
|
xorq %r13, %r13
|
|
addq %rax, %r9
|
|
movq $19, %rax
|
|
adcq %rdx, %r13
|
|
mulq %r14
|
|
xorq %r14, %r14
|
|
addq %rax, %r10
|
|
movq $19, %rax
|
|
adcq %rdx, %r14
|
|
mulq %r15
|
|
# Add remaining product results in
|
|
addq %r12, %r9
|
|
adcq %r13, %r10
|
|
adcq %r14, %r11
|
|
adcq %rax, %r11
|
|
adcq $0x00, %rdx
|
|
# Overflow
|
|
shldq $0x01, %r11, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rcx, %r11
|
|
addq %rax, %r8
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Reduce if top bit set
|
|
movq %r11, %rdx
|
|
shrq $63, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rcx, %r11
|
|
addq %rax, %r8
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Store
|
|
movq %r8, (%rdi)
|
|
movq %r9, 8(%rdi)
|
|
movq %r10, 16(%rdi)
|
|
movq %r11, 24(%rdi)
|
|
movq 8(%rsp), %rdi
|
|
movq 8(%rsp), %rsi
|
|
movq 160(%rsp), %rbx
|
|
# Multiply
|
|
# A[0] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq (%rsi)
|
|
movq %rax, %r8
|
|
movq %rdx, %r9
|
|
# A[0] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq (%rsi)
|
|
xorq %r10, %r10
|
|
addq %rax, %r9
|
|
adcq %rdx, %r10
|
|
# A[1] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq 8(%rsi)
|
|
xorq %r11, %r11
|
|
addq %rax, %r9
|
|
adcq %rdx, %r10
|
|
adcq $0x00, %r11
|
|
# A[0] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq (%rsi)
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
# A[1] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq 8(%rsi)
|
|
xorq %r12, %r12
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
adcq $0x00, %r12
|
|
# A[2] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq 16(%rsi)
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
adcq $0x00, %r12
|
|
# A[0] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq (%rsi)
|
|
xorq %r13, %r13
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[1] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq 8(%rsi)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[2] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq 16(%rsi)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[3] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[1] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq 8(%rsi)
|
|
xorq %r14, %r14
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[2] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq 16(%rsi)
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[3] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[2] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq 16(%rsi)
|
|
xorq %r15, %r15
|
|
addq %rax, %r13
|
|
adcq %rdx, %r14
|
|
adcq $0x00, %r15
|
|
# A[3] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r13
|
|
adcq %rdx, %r14
|
|
adcq $0x00, %r15
|
|
# A[3] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r14
|
|
adcq %rdx, %r15
|
|
# Reduce
|
|
movq $0x7fffffffffffffff, %rcx
|
|
# Move top half into t4-t7 and remove top bit from t3
|
|
shldq $0x01, %r14, %r15
|
|
shldq $0x01, %r13, %r14
|
|
shldq $0x01, %r12, %r13
|
|
shldq $0x01, %r11, %r12
|
|
andq %rcx, %r11
|
|
# Multiply top half by 19
|
|
movq $19, %rax
|
|
mulq %r12
|
|
xorq %r12, %r12
|
|
addq %rax, %r8
|
|
movq $19, %rax
|
|
adcq %rdx, %r12
|
|
mulq %r13
|
|
xorq %r13, %r13
|
|
addq %rax, %r9
|
|
movq $19, %rax
|
|
adcq %rdx, %r13
|
|
mulq %r14
|
|
xorq %r14, %r14
|
|
addq %rax, %r10
|
|
movq $19, %rax
|
|
adcq %rdx, %r14
|
|
mulq %r15
|
|
# Add remaining product results in
|
|
addq %r12, %r9
|
|
adcq %r13, %r10
|
|
adcq %r14, %r11
|
|
adcq %rax, %r11
|
|
adcq $0x00, %rdx
|
|
# Overflow
|
|
shldq $0x01, %r11, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rcx, %r11
|
|
addq %rax, %r8
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Reduce if top bit set
|
|
movq %r11, %rdx
|
|
shrq $63, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rcx, %r11
|
|
addq %rax, %r8
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Store
|
|
movq %r8, (%rdi)
|
|
movq %r9, 8(%rdi)
|
|
movq %r10, 16(%rdi)
|
|
movq %r11, 24(%rdi)
|
|
movq 24(%rsp), %rdi
|
|
movq 144(%rsp), %rsi
|
|
movq 136(%rsp), %rbx
|
|
# Multiply
|
|
# A[0] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq (%rsi)
|
|
movq %rax, %r8
|
|
movq %rdx, %r9
|
|
# A[0] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq (%rsi)
|
|
xorq %r10, %r10
|
|
addq %rax, %r9
|
|
adcq %rdx, %r10
|
|
# A[1] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq 8(%rsi)
|
|
xorq %r11, %r11
|
|
addq %rax, %r9
|
|
adcq %rdx, %r10
|
|
adcq $0x00, %r11
|
|
# A[0] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq (%rsi)
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
# A[1] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq 8(%rsi)
|
|
xorq %r12, %r12
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
adcq $0x00, %r12
|
|
# A[2] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq 16(%rsi)
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
adcq $0x00, %r12
|
|
# A[0] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq (%rsi)
|
|
xorq %r13, %r13
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[1] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq 8(%rsi)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[2] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq 16(%rsi)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[3] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[1] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq 8(%rsi)
|
|
xorq %r14, %r14
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[2] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq 16(%rsi)
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[3] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[2] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq 16(%rsi)
|
|
xorq %r15, %r15
|
|
addq %rax, %r13
|
|
adcq %rdx, %r14
|
|
adcq $0x00, %r15
|
|
# A[3] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r13
|
|
adcq %rdx, %r14
|
|
adcq $0x00, %r15
|
|
# A[3] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r14
|
|
adcq %rdx, %r15
|
|
# Reduce
|
|
movq $0x7fffffffffffffff, %rcx
|
|
# Move top half into t4-t7 and remove top bit from t3
|
|
shldq $0x01, %r14, %r15
|
|
shldq $0x01, %r13, %r14
|
|
shldq $0x01, %r12, %r13
|
|
shldq $0x01, %r11, %r12
|
|
andq %rcx, %r11
|
|
# Multiply top half by 19
|
|
movq $19, %rax
|
|
mulq %r12
|
|
xorq %r12, %r12
|
|
addq %rax, %r8
|
|
movq $19, %rax
|
|
adcq %rdx, %r12
|
|
mulq %r13
|
|
xorq %r13, %r13
|
|
addq %rax, %r9
|
|
movq $19, %rax
|
|
adcq %rdx, %r13
|
|
mulq %r14
|
|
xorq %r14, %r14
|
|
addq %rax, %r10
|
|
movq $19, %rax
|
|
adcq %rdx, %r14
|
|
mulq %r15
|
|
# Add remaining product results in
|
|
addq %r12, %r9
|
|
adcq %r13, %r10
|
|
adcq %r14, %r11
|
|
adcq %rax, %r11
|
|
adcq $0x00, %rdx
|
|
# Overflow
|
|
shldq $0x01, %r11, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rcx, %r11
|
|
addq %rax, %r8
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Reduce if top bit set
|
|
movq %r11, %rdx
|
|
shrq $63, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rcx, %r11
|
|
addq %rax, %r8
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Store
|
|
movq %r8, (%rdi)
|
|
movq %r9, 8(%rdi)
|
|
movq %r10, 16(%rdi)
|
|
movq %r11, 24(%rdi)
|
|
leaq 48(%rsp), %rdi
|
|
movq 128(%rsp), %rsi
|
|
movq 128(%rsp), %rbx
|
|
# Add
|
|
movq (%rsi), %r8
|
|
movq 8(%rsi), %r9
|
|
addq (%rbx), %r8
|
|
movq 16(%rsi), %r10
|
|
adcq 8(%rbx), %r9
|
|
movq 24(%rsi), %rcx
|
|
adcq 16(%rbx), %r10
|
|
movq $-19, %rax
|
|
adcq 24(%rbx), %rcx
|
|
movq $0x7fffffffffffffff, %rdx
|
|
movq %rcx, %r11
|
|
sarq $63, %rcx
|
|
# Mask the modulus
|
|
andq %rcx, %rax
|
|
andq %rcx, %rdx
|
|
# Sub modulus (if overflow)
|
|
subq %rax, %r8
|
|
sbbq %rcx, %r9
|
|
sbbq %rcx, %r10
|
|
sbbq %rdx, %r11
|
|
movq %r8, (%rdi)
|
|
movq %r9, 8(%rdi)
|
|
movq %r10, 16(%rdi)
|
|
movq %r11, 24(%rdi)
|
|
movq (%rsp), %rdi
|
|
movq 16(%rsp), %rsi
|
|
movq 8(%rsp), %rbx
|
|
# Sub
|
|
movq (%rsi), %r8
|
|
movq 8(%rsi), %r9
|
|
movq 16(%rsi), %r10
|
|
movq 24(%rsi), %r11
|
|
subq (%rbx), %r8
|
|
movq $0x00, %rcx
|
|
sbbq 8(%rbx), %r9
|
|
movq $-19, %rax
|
|
sbbq 16(%rbx), %r10
|
|
movq $0x7fffffffffffffff, %rdx
|
|
sbbq 24(%rbx), %r11
|
|
sbbq $0x00, %rcx
|
|
# Mask the modulus
|
|
andq %rcx, %rax
|
|
andq %rcx, %rdx
|
|
# Add modulus (if underflow)
|
|
addq %rax, %r8
|
|
adcq %rcx, %r9
|
|
adcq %rcx, %r10
|
|
adcq %rdx, %r11
|
|
movq %r8, (%rdi)
|
|
movq %r9, 8(%rdi)
|
|
movq %r10, 16(%rdi)
|
|
movq %r11, 24(%rdi)
|
|
movq 8(%rsp), %rdi
|
|
movq 16(%rsp), %rsi
|
|
movq 8(%rsp), %rbx
|
|
# Add
|
|
movq (%rsi), %r8
|
|
movq 8(%rsi), %r9
|
|
addq (%rbx), %r8
|
|
movq 16(%rsi), %r10
|
|
adcq 8(%rbx), %r9
|
|
movq 24(%rsi), %rcx
|
|
adcq 16(%rbx), %r10
|
|
movq $-19, %rax
|
|
adcq 24(%rbx), %rcx
|
|
movq $0x7fffffffffffffff, %rdx
|
|
movq %rcx, %r11
|
|
sarq $63, %rcx
|
|
# Mask the modulus
|
|
andq %rcx, %rax
|
|
andq %rcx, %rdx
|
|
# Sub modulus (if overflow)
|
|
subq %rax, %r8
|
|
sbbq %rcx, %r9
|
|
sbbq %rcx, %r10
|
|
sbbq %rdx, %r11
|
|
movq %r8, (%rdi)
|
|
movq %r9, 8(%rdi)
|
|
movq %r10, 16(%rdi)
|
|
movq %r11, 24(%rdi)
|
|
movq 16(%rsp), %rdi
|
|
leaq 48(%rsp), %rsi
|
|
movq 24(%rsp), %rbx
|
|
# Add
|
|
movq (%rsi), %r8
|
|
movq 8(%rsi), %r9
|
|
addq (%rbx), %r8
|
|
movq 16(%rsi), %r10
|
|
adcq 8(%rbx), %r9
|
|
movq 24(%rsi), %rcx
|
|
adcq 16(%rbx), %r10
|
|
movq $-19, %rax
|
|
adcq 24(%rbx), %rcx
|
|
movq $0x7fffffffffffffff, %rdx
|
|
movq %rcx, %r11
|
|
sarq $63, %rcx
|
|
# Mask the modulus
|
|
andq %rcx, %rax
|
|
andq %rcx, %rdx
|
|
# Sub modulus (if overflow)
|
|
subq %rax, %r8
|
|
sbbq %rcx, %r9
|
|
sbbq %rcx, %r10
|
|
sbbq %rdx, %r11
|
|
movq %r8, (%rdi)
|
|
movq %r9, 8(%rdi)
|
|
movq %r10, 16(%rdi)
|
|
movq %r11, 24(%rdi)
|
|
movq 24(%rsp), %rdi
|
|
leaq 48(%rsp), %rsi
|
|
movq 24(%rsp), %rbx
|
|
# Sub
|
|
movq (%rsi), %r8
|
|
movq 8(%rsi), %r9
|
|
movq 16(%rsi), %r10
|
|
movq 24(%rsi), %r11
|
|
subq (%rbx), %r8
|
|
movq $0x00, %rcx
|
|
sbbq 8(%rbx), %r9
|
|
movq $-19, %rax
|
|
sbbq 16(%rbx), %r10
|
|
movq $0x7fffffffffffffff, %rdx
|
|
sbbq 24(%rbx), %r11
|
|
sbbq $0x00, %rcx
|
|
# Mask the modulus
|
|
andq %rcx, %rax
|
|
andq %rcx, %rdx
|
|
# Add modulus (if underflow)
|
|
addq %rax, %r8
|
|
adcq %rcx, %r9
|
|
adcq %rcx, %r10
|
|
adcq %rdx, %r11
|
|
movq %r8, (%rdi)
|
|
movq %r9, 8(%rdi)
|
|
movq %r10, 16(%rdi)
|
|
movq %r11, 24(%rdi)
|
|
addq $0x50, %rsp
|
|
popq %r15
|
|
popq %r14
|
|
popq %r13
|
|
popq %r12
|
|
popq %rbx
|
|
repz retq
|
|
#ifndef __APPLE__
|
|
.size fe_ge_madd_x64,.-fe_ge_madd_x64
|
|
#endif /* __APPLE__ */
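# fe_ge_msub_x64 mirrors fe_ge_madd_x64: the stack-passed multiplier
# pointers at 152(%rsp) and 160(%rsp) swap roles and the closing
# add/sub pair is exchanged, consistent with subtracting rather than
# adding the second operand.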
#ifndef __APPLE__
|
|
.text
|
|
.globl fe_ge_msub_x64
|
|
.type fe_ge_msub_x64,@function
|
|
.align 4
|
|
fe_ge_msub_x64:
|
|
#else
|
|
.section __TEXT,__text
|
|
.globl _fe_ge_msub_x64
|
|
.p2align 2
|
|
_fe_ge_msub_x64:
|
|
#endif /* __APPLE__ */
|
|
pushq %rbx
|
|
pushq %r12
|
|
pushq %r13
|
|
pushq %r14
|
|
pushq %r15
|
|
subq $0x50, %rsp
|
|
movq %rdi, (%rsp)
|
|
movq %rsi, 8(%rsp)
|
|
movq %rdx, 16(%rsp)
|
|
movq %rcx, 24(%rsp)
|
|
movq %r8, 32(%rsp)
|
|
movq %r9, 40(%rsp)
|
|
movq (%rsp), %rdi
|
|
movq 40(%rsp), %rsi
|
|
movq 32(%rsp), %rbx
|
|
# Add
|
|
movq (%rsi), %r8
|
|
movq 8(%rsi), %r9
|
|
addq (%rbx), %r8
|
|
movq 16(%rsi), %r10
|
|
adcq 8(%rbx), %r9
|
|
movq 24(%rsi), %rcx
|
|
adcq 16(%rbx), %r10
|
|
movq $-19, %rax
|
|
adcq 24(%rbx), %rcx
|
|
movq $0x7fffffffffffffff, %rdx
|
|
movq %rcx, %r11
|
|
sarq $63, %rcx
|
|
# Mask the modulus
|
|
andq %rcx, %rax
|
|
andq %rcx, %rdx
|
|
# Sub modulus (if overflow)
|
|
subq %rax, %r8
|
|
sbbq %rcx, %r9
|
|
sbbq %rcx, %r10
|
|
sbbq %rdx, %r11
|
|
movq %r8, (%rdi)
|
|
movq %r9, 8(%rdi)
|
|
movq %r10, 16(%rdi)
|
|
movq %r11, 24(%rdi)
|
|
movq 8(%rsp), %rdi
|
|
movq 40(%rsp), %rsi
|
|
movq 32(%rsp), %rbx
|
|
# Sub
|
|
movq (%rsi), %r8
|
|
movq 8(%rsi), %r9
|
|
movq 16(%rsi), %r10
|
|
movq 24(%rsi), %r11
|
|
subq (%rbx), %r8
|
|
movq $0x00, %rcx
|
|
sbbq 8(%rbx), %r9
|
|
movq $-19, %rax
|
|
sbbq 16(%rbx), %r10
|
|
movq $0x7fffffffffffffff, %rdx
|
|
sbbq 24(%rbx), %r11
|
|
sbbq $0x00, %rcx
|
|
# Mask the modulus
|
|
andq %rcx, %rax
|
|
andq %rcx, %rdx
|
|
# Add modulus (if underflow)
|
|
addq %rax, %r8
|
|
adcq %rcx, %r9
|
|
adcq %rcx, %r10
|
|
adcq %rdx, %r11
|
|
movq %r8, (%rdi)
|
|
movq %r9, 8(%rdi)
|
|
movq %r10, 16(%rdi)
|
|
movq %r11, 24(%rdi)
|
|
movq 16(%rsp), %rdi
|
|
movq (%rsp), %rsi
|
|
movq 160(%rsp), %rbx
|
|
# Multiply
|
|
# A[0] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq (%rsi)
|
|
movq %rax, %r8
|
|
movq %rdx, %r9
|
|
# A[0] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq (%rsi)
|
|
xorq %r10, %r10
|
|
addq %rax, %r9
|
|
adcq %rdx, %r10
|
|
# A[1] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq 8(%rsi)
|
|
xorq %r11, %r11
|
|
addq %rax, %r9
|
|
adcq %rdx, %r10
|
|
adcq $0x00, %r11
|
|
# A[0] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq (%rsi)
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
# A[1] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq 8(%rsi)
|
|
xorq %r12, %r12
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
adcq $0x00, %r12
|
|
# A[2] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq 16(%rsi)
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
adcq $0x00, %r12
|
|
# A[0] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq (%rsi)
|
|
xorq %r13, %r13
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[1] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq 8(%rsi)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[2] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq 16(%rsi)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[3] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[1] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq 8(%rsi)
|
|
xorq %r14, %r14
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[2] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq 16(%rsi)
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[3] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[2] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq 16(%rsi)
|
|
xorq %r15, %r15
|
|
addq %rax, %r13
|
|
adcq %rdx, %r14
|
|
adcq $0x00, %r15
|
|
# A[3] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r13
|
|
adcq %rdx, %r14
|
|
adcq $0x00, %r15
|
|
# A[3] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r14
|
|
adcq %rdx, %r15
|
|
# Reduce
|
|
movq $0x7fffffffffffffff, %rcx
|
|
# Move top half into t4-t7 and remove top bit from t3
|
|
shldq $0x01, %r14, %r15
|
|
shldq $0x01, %r13, %r14
|
|
shldq $0x01, %r12, %r13
|
|
shldq $0x01, %r11, %r12
|
|
andq %rcx, %r11
|
|
# Multiply top half by 19
|
|
movq $19, %rax
|
|
mulq %r12
|
|
xorq %r12, %r12
|
|
addq %rax, %r8
|
|
movq $19, %rax
|
|
adcq %rdx, %r12
|
|
mulq %r13
|
|
xorq %r13, %r13
|
|
addq %rax, %r9
|
|
movq $19, %rax
|
|
adcq %rdx, %r13
|
|
mulq %r14
|
|
xorq %r14, %r14
|
|
addq %rax, %r10
|
|
movq $19, %rax
|
|
adcq %rdx, %r14
|
|
mulq %r15
|
|
# Add remaining product results in
|
|
addq %r12, %r9
|
|
adcq %r13, %r10
|
|
adcq %r14, %r11
|
|
adcq %rax, %r11
|
|
adcq $0x00, %rdx
|
|
# Overflow
|
|
shldq $0x01, %r11, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rcx, %r11
|
|
addq %rax, %r8
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Reduce if top bit set
|
|
movq %r11, %rdx
|
|
shrq $63, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rcx, %r11
|
|
addq %rax, %r8
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Store
|
|
movq %r8, (%rdi)
|
|
movq %r9, 8(%rdi)
|
|
movq %r10, 16(%rdi)
|
|
movq %r11, 24(%rdi)
|
|
movq 8(%rsp), %rdi
|
|
movq 8(%rsp), %rsi
|
|
movq 152(%rsp), %rbx
|
|
# Multiply
|
|
# A[0] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq (%rsi)
|
|
movq %rax, %r8
|
|
movq %rdx, %r9
|
|
# A[0] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq (%rsi)
|
|
xorq %r10, %r10
|
|
addq %rax, %r9
|
|
adcq %rdx, %r10
|
|
# A[1] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq 8(%rsi)
|
|
xorq %r11, %r11
|
|
addq %rax, %r9
|
|
adcq %rdx, %r10
|
|
adcq $0x00, %r11
|
|
# A[0] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq (%rsi)
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
# A[1] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq 8(%rsi)
|
|
xorq %r12, %r12
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
adcq $0x00, %r12
|
|
# A[2] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq 16(%rsi)
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
adcq $0x00, %r12
|
|
# A[0] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq (%rsi)
|
|
xorq %r13, %r13
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[1] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq 8(%rsi)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[2] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq 16(%rsi)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[3] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[1] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq 8(%rsi)
|
|
xorq %r14, %r14
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[2] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq 16(%rsi)
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[3] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[2] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq 16(%rsi)
|
|
xorq %r15, %r15
|
|
addq %rax, %r13
|
|
adcq %rdx, %r14
|
|
adcq $0x00, %r15
|
|
# A[3] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r13
|
|
adcq %rdx, %r14
|
|
adcq $0x00, %r15
|
|
# A[3] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r14
|
|
adcq %rdx, %r15
|
|
# Reduce
|
|
movq $0x7fffffffffffffff, %rcx
|
|
# Move top half into t4-t7 and remove top bit from t3
|
|
shldq $0x01, %r14, %r15
|
|
shldq $0x01, %r13, %r14
|
|
shldq $0x01, %r12, %r13
|
|
shldq $0x01, %r11, %r12
|
|
andq %rcx, %r11
|
|
# Multiply top half by 19
|
|
movq $19, %rax
|
|
mulq %r12
|
|
xorq %r12, %r12
|
|
addq %rax, %r8
|
|
movq $19, %rax
|
|
adcq %rdx, %r12
|
|
mulq %r13
|
|
xorq %r13, %r13
|
|
addq %rax, %r9
|
|
movq $19, %rax
|
|
adcq %rdx, %r13
|
|
mulq %r14
|
|
xorq %r14, %r14
|
|
addq %rax, %r10
|
|
movq $19, %rax
|
|
adcq %rdx, %r14
|
|
mulq %r15
|
|
# Add remaining product results in
|
|
addq %r12, %r9
|
|
adcq %r13, %r10
|
|
adcq %r14, %r11
|
|
adcq %rax, %r11
|
|
adcq $0x00, %rdx
|
|
# Overflow
|
|
shldq $0x01, %r11, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rcx, %r11
|
|
addq %rax, %r8
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Reduce if top bit set
|
|
movq %r11, %rdx
|
|
shrq $63, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rcx, %r11
|
|
addq %rax, %r8
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Store
|
|
movq %r8, (%rdi)
|
|
movq %r9, 8(%rdi)
|
|
movq %r10, 16(%rdi)
|
|
movq %r11, 24(%rdi)
|
|
movq 24(%rsp), %rdi
|
|
movq 144(%rsp), %rsi
|
|
movq 136(%rsp), %rbx
|
|
# Multiply
|
|
# A[0] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq (%rsi)
|
|
movq %rax, %r8
|
|
movq %rdx, %r9
|
|
# A[0] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq (%rsi)
|
|
xorq %r10, %r10
|
|
addq %rax, %r9
|
|
adcq %rdx, %r10
|
|
# A[1] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq 8(%rsi)
|
|
xorq %r11, %r11
|
|
addq %rax, %r9
|
|
adcq %rdx, %r10
|
|
adcq $0x00, %r11
|
|
# A[0] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq (%rsi)
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
# A[1] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq 8(%rsi)
|
|
xorq %r12, %r12
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
adcq $0x00, %r12
|
|
# A[2] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq 16(%rsi)
|
|
addq %rax, %r10
|
|
adcq %rdx, %r11
|
|
adcq $0x00, %r12
|
|
# A[0] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq (%rsi)
|
|
xorq %r13, %r13
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[1] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq 8(%rsi)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[2] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq 16(%rsi)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[3] * B[0]
|
|
movq (%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r11
|
|
adcq %rdx, %r12
|
|
adcq $0x00, %r13
|
|
# A[1] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq 8(%rsi)
|
|
xorq %r14, %r14
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[2] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq 16(%rsi)
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[3] * B[1]
|
|
movq 8(%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r12
|
|
adcq %rdx, %r13
|
|
adcq $0x00, %r14
|
|
# A[2] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq 16(%rsi)
|
|
xorq %r15, %r15
|
|
addq %rax, %r13
|
|
adcq %rdx, %r14
|
|
adcq $0x00, %r15
|
|
# A[3] * B[2]
|
|
movq 16(%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r13
|
|
adcq %rdx, %r14
|
|
adcq $0x00, %r15
|
|
# A[3] * B[3]
|
|
movq 24(%rbx), %rax
|
|
mulq 24(%rsi)
|
|
addq %rax, %r14
|
|
adcq %rdx, %r15
|
|
# Reduce
|
|
movq $0x7fffffffffffffff, %rcx
|
|
# Move top half into t4-t7 and remove top bit from t3
|
|
shldq $0x01, %r14, %r15
|
|
shldq $0x01, %r13, %r14
|
|
shldq $0x01, %r12, %r13
|
|
shldq $0x01, %r11, %r12
|
|
andq %rcx, %r11
|
|
# Multiply top half by 19
|
|
movq $19, %rax
|
|
mulq %r12
|
|
xorq %r12, %r12
|
|
addq %rax, %r8
|
|
movq $19, %rax
|
|
adcq %rdx, %r12
|
|
mulq %r13
|
|
xorq %r13, %r13
|
|
addq %rax, %r9
|
|
movq $19, %rax
|
|
adcq %rdx, %r13
|
|
mulq %r14
|
|
xorq %r14, %r14
|
|
addq %rax, %r10
|
|
movq $19, %rax
|
|
adcq %rdx, %r14
|
|
mulq %r15
|
|
# Add remaining product results in
|
|
addq %r12, %r9
|
|
adcq %r13, %r10
|
|
adcq %r14, %r11
|
|
adcq %rax, %r11
|
|
adcq $0x00, %rdx
|
|
# Overflow
|
|
shldq $0x01, %r11, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rcx, %r11
|
|
addq %rax, %r8
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Reduce if top bit set
|
|
movq %r11, %rdx
|
|
shrq $63, %rdx
|
|
imulq $19, %rdx, %rax
|
|
andq %rcx, %r11
|
|
addq %rax, %r8
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Store
|
|
movq %r8, (%rdi)
|
|
movq %r9, 8(%rdi)
|
|
movq %r10, 16(%rdi)
|
|
movq %r11, 24(%rdi)
|
|
leaq 48(%rsp), %rdi
|
|
movq 128(%rsp), %rsi
|
|
movq 128(%rsp), %rbx
|
|
# Add
|
|
movq (%rsi), %r8
|
|
movq 8(%rsi), %r9
|
|
addq (%rbx), %r8
|
|
movq 16(%rsi), %r10
|
|
adcq 8(%rbx), %r9
|
|
movq 24(%rsi), %rcx
|
|
adcq 16(%rbx), %r10
|
|
movq $-19, %rax
|
|
adcq 24(%rbx), %rcx
|
|
movq $0x7fffffffffffffff, %rdx
|
|
movq %rcx, %r11
|
|
sarq $63, %rcx
|
|
# Mask the modulus
|
|
andq %rcx, %rax
|
|
andq %rcx, %rdx
|
|
# Sub modulus (if overflow)
|
|
subq %rax, %r8
|
|
sbbq %rcx, %r9
|
|
sbbq %rcx, %r10
|
|
sbbq %rdx, %r11
|
|
movq %r8, (%rdi)
|
|
movq %r9, 8(%rdi)
|
|
movq %r10, 16(%rdi)
|
|
movq %r11, 24(%rdi)
|
|
movq (%rsp), %rdi
|
|
movq 16(%rsp), %rsi
|
|
movq 8(%rsp), %rbx
|
|
# Sub
|
|
movq (%rsi), %r8
|
|
movq 8(%rsi), %r9
|
|
movq 16(%rsi), %r10
|
|
movq 24(%rsi), %r11
|
|
subq (%rbx), %r8
|
|
movq $0x00, %rcx
|
|
sbbq 8(%rbx), %r9
|
|
movq $-19, %rax
|
|
sbbq 16(%rbx), %r10
|
|
movq $0x7fffffffffffffff, %rdx
|
|
sbbq 24(%rbx), %r11
|
|
sbbq $0x00, %rcx
|
|
# Mask the modulus
|
|
andq %rcx, %rax
|
|
andq %rcx, %rdx
|
|
# Add modulus (if underflow)
|
|
addq %rax, %r8
|
|
adcq %rcx, %r9
|
|
adcq %rcx, %r10
|
|
adcq %rdx, %r11
|
|
movq %r8, (%rdi)
|
|
movq %r9, 8(%rdi)
|
|
movq %r10, 16(%rdi)
|
|
movq %r11, 24(%rdi)
|
|
movq 8(%rsp), %rdi
|
|
movq 16(%rsp), %rsi
|
|
movq 8(%rsp), %rbx
|
|
# Add
|
|
movq (%rsi), %r8
|
|
movq 8(%rsi), %r9
|
|
addq (%rbx), %r8
|
|
movq 16(%rsi), %r10
|
|
adcq 8(%rbx), %r9
|
|
movq 24(%rsi), %rcx
|
|
adcq 16(%rbx), %r10
|
|
movq $-19, %rax
|
|
adcq 24(%rbx), %rcx
|
|
movq $0x7fffffffffffffff, %rdx
|
|
movq %rcx, %r11
|
|
sarq $63, %rcx
|
|
# Mask the modulus
|
|
andq %rcx, %rax
|
|
andq %rcx, %rdx
|
|
# Sub modulus (if overflow)
|
|
subq %rax, %r8
|
|
sbbq %rcx, %r9
|
|
sbbq %rcx, %r10
|
|
sbbq %rdx, %r11
|
|
movq %r8, (%rdi)
|
|
movq %r9, 8(%rdi)
|
|
movq %r10, 16(%rdi)
|
|
movq %r11, 24(%rdi)
|
|
movq 16(%rsp), %rdi
|
|
leaq 48(%rsp), %rsi
|
|
movq 24(%rsp), %rbx
|
|
# Sub
|
|
movq (%rsi), %r8
|
|
movq 8(%rsi), %r9
|
|
movq 16(%rsi), %r10
|
|
movq 24(%rsi), %r11
|
|
subq (%rbx), %r8
|
|
movq $0x00, %rcx
|
|
sbbq 8(%rbx), %r9
|
|
movq $-19, %rax
|
|
sbbq 16(%rbx), %r10
|
|
movq $0x7fffffffffffffff, %rdx
|
|
sbbq 24(%rbx), %r11
|
|
sbbq $0x00, %rcx
|
|
# Mask the modulus
|
|
andq %rcx, %rax
|
|
andq %rcx, %rdx
|
|
# Add modulus (if underflow)
|
|
addq %rax, %r8
|
|
adcq %rcx, %r9
|
|
adcq %rcx, %r10
|
|
adcq %rdx, %r11
|
|
movq %r8, (%rdi)
|
|
movq %r9, 8(%rdi)
|
|
movq %r10, 16(%rdi)
|
|
movq %r11, 24(%rdi)
|
|
movq 24(%rsp), %rdi
|
|
leaq 48(%rsp), %rsi
|
|
movq 24(%rsp), %rbx
|
|
# Add
|
|
movq (%rsi), %r8
|
|
movq 8(%rsi), %r9
|
|
addq (%rbx), %r8
|
|
movq 16(%rsi), %r10
|
|
adcq 8(%rbx), %r9
|
|
movq 24(%rsi), %rcx
|
|
adcq 16(%rbx), %r10
|
|
movq $-19, %rax
|
|
adcq 24(%rbx), %rcx
|
|
movq $0x7fffffffffffffff, %rdx
|
|
movq %rcx, %r11
|
|
sarq $63, %rcx
|
|
# Mask the modulus
|
|
andq %rcx, %rax
|
|
andq %rcx, %rdx
|
|
# Sub modulus (if overflow)
|
|
subq %rax, %r8
|
|
sbbq %rcx, %r9
|
|
sbbq %rcx, %r10
|
|
sbbq %rdx, %r11
|
|
movq %r8, (%rdi)
|
|
movq %r9, 8(%rdi)
|
|
movq %r10, 16(%rdi)
|
|
movq %r11, 24(%rdi)
|
|
addq $0x50, %rsp
|
|
popq %r15
|
|
popq %r14
|
|
popq %r13
|
|
popq %r12
|
|
popq %rbx
|
|
repz retq
|
|
#ifndef __APPLE__
|
|
.size fe_ge_msub_x64,.-fe_ge_msub_x64
|
|
#endif /* __APPLE__ */
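# fe_ge_add_x64: full group-element addition.  Same shape as the mixed
# variants, but the extra operands extend one slot further up the
# caller's stack, to 168(%rsp) (used by the second multiply below),
# since a full addition takes one more field element than the mixed
# forms.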
#ifndef __APPLE__
|
|
.text
|
|
.globl fe_ge_add_x64
|
|
.type fe_ge_add_x64,@function
|
|
.align 4
|
|
fe_ge_add_x64:
|
|
#else
|
|
.section __TEXT,__text
|
|
.globl _fe_ge_add_x64
|
|
.p2align 2
|
|
_fe_ge_add_x64:
|
|
#endif /* __APPLE__ */
|
|
pushq %rbx
|
|
pushq %r12
|
|
pushq %r13
|
|
pushq %r14
|
|
pushq %r15
|
|
subq $0x50, %rsp
|
|
movq %rdi, (%rsp)
|
|
movq %rsi, 8(%rsp)
|
|
movq %rdx, 16(%rsp)
|
|
movq %rcx, 24(%rsp)
|
|
movq %r8, 32(%rsp)
|
|
movq %r9, 40(%rsp)
|
|
movq (%rsp), %rdi
|
|
movq 40(%rsp), %rsi
|
|
movq 32(%rsp), %rbx
|
|
# Add
|
|
movq (%rsi), %r8
|
|
movq 8(%rsi), %r9
|
|
addq (%rbx), %r8
|
|
movq 16(%rsi), %r10
|
|
adcq 8(%rbx), %r9
|
|
movq 24(%rsi), %rcx
|
|
adcq 16(%rbx), %r10
|
|
movq $-19, %rax
|
|
adcq 24(%rbx), %rcx
|
|
movq $0x7fffffffffffffff, %rdx
|
|
movq %rcx, %r11
|
|
sarq $63, %rcx
|
|
# Mask the modulus
|
|
andq %rcx, %rax
|
|
andq %rcx, %rdx
|
|
# Sub modulus (if overflow)
|
|
subq %rax, %r8
|
|
sbbq %rcx, %r9
|
|
sbbq %rcx, %r10
|
|
sbbq %rdx, %r11
|
|
movq %r8, (%rdi)
|
|
movq %r9, 8(%rdi)
|
|
movq %r10, 16(%rdi)
|
|
movq %r11, 24(%rdi)
|
|
movq 8(%rsp), %rdi
|
|
movq 40(%rsp), %rsi
|
|
movq 32(%rsp), %rbx
|
|
# Sub
|
|
movq (%rsi), %r8
|
|
movq 8(%rsi), %r9
|
|
movq 16(%rsi), %r10
|
|
movq 24(%rsi), %r11
|
|
subq (%rbx), %r8
|
|
movq $0x00, %rcx
|
|
sbbq 8(%rbx), %r9
|
|
movq $-19, %rax
sbbq 16(%rbx), %r10
movq $0x7fffffffffffffff, %rdx
sbbq 24(%rbx), %r11
sbbq $0x00, %rcx
# Mask the modulus
andq %rcx, %rax
andq %rcx, %rdx
# Add modulus (if underflow)
addq %rax, %r8
adcq %rcx, %r9
adcq %rcx, %r10
adcq %rdx, %r11
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 16(%rsp), %rdi
movq (%rsp), %rsi
movq 160(%rsp), %rbx
# Multiply
# A[0] * B[0]
movq (%rbx), %rax
mulq (%rsi)
movq %rax, %r8
movq %rdx, %r9
# A[0] * B[1]
movq 8(%rbx), %rax
mulq (%rsi)
xorq %r10, %r10
addq %rax, %r9
adcq %rdx, %r10
# A[1] * B[0]
movq (%rbx), %rax
mulq 8(%rsi)
xorq %r11, %r11
addq %rax, %r9
adcq %rdx, %r10
adcq $0x00, %r11
# A[0] * B[2]
movq 16(%rbx), %rax
mulq (%rsi)
addq %rax, %r10
adcq %rdx, %r11
# A[1] * B[1]
movq 8(%rbx), %rax
mulq 8(%rsi)
xorq %r12, %r12
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[2] * B[0]
movq (%rbx), %rax
mulq 16(%rsi)
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[0] * B[3]
movq 24(%rbx), %rax
mulq (%rsi)
xorq %r13, %r13
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[2]
movq 16(%rbx), %rax
mulq 8(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[2] * B[1]
movq 8(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[3] * B[0]
movq (%rbx), %rax
mulq 24(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[3]
movq 24(%rbx), %rax
mulq 8(%rsi)
xorq %r14, %r14
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[2]
movq 16(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[3] * B[1]
movq 8(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[3]
movq 24(%rbx), %rax
mulq 16(%rsi)
xorq %r15, %r15
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[2]
movq 16(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[3]
movq 24(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r14
adcq %rdx, %r15
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rcx, %r11
# Multiply top half by 19
movq $19, %rax
mulq %r12
xorq %r12, %r12
addq %rax, %r8
movq $19, %rax
adcq %rdx, %r12
mulq %r13
xorq %r13, %r13
addq %rax, %r9
movq $19, %rax
adcq %rdx, %r13
mulq %r14
xorq %r14, %r14
addq %rax, %r10
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
adcq %rax, %r11
adcq $0x00, %rdx
# Overflow
shldq $0x01, %r11, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 8(%rsp), %rdi
movq 8(%rsp), %rsi
movq 168(%rsp), %rbx
# Multiply
# A[0] * B[0]
movq (%rbx), %rax
mulq (%rsi)
movq %rax, %r8
movq %rdx, %r9
# A[0] * B[1]
movq 8(%rbx), %rax
mulq (%rsi)
xorq %r10, %r10
addq %rax, %r9
adcq %rdx, %r10
# A[1] * B[0]
movq (%rbx), %rax
mulq 8(%rsi)
xorq %r11, %r11
addq %rax, %r9
adcq %rdx, %r10
adcq $0x00, %r11
# A[0] * B[2]
movq 16(%rbx), %rax
mulq (%rsi)
addq %rax, %r10
adcq %rdx, %r11
# A[1] * B[1]
movq 8(%rbx), %rax
mulq 8(%rsi)
xorq %r12, %r12
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[2] * B[0]
movq (%rbx), %rax
mulq 16(%rsi)
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[0] * B[3]
movq 24(%rbx), %rax
mulq (%rsi)
xorq %r13, %r13
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[2]
movq 16(%rbx), %rax
mulq 8(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[2] * B[1]
movq 8(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[3] * B[0]
movq (%rbx), %rax
mulq 24(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[3]
movq 24(%rbx), %rax
mulq 8(%rsi)
xorq %r14, %r14
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[2]
movq 16(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[3] * B[1]
movq 8(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[3]
movq 24(%rbx), %rax
mulq 16(%rsi)
xorq %r15, %r15
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[2]
movq 16(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[3]
movq 24(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r14
adcq %rdx, %r15
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rcx, %r11
# Multiply top half by 19
movq $19, %rax
mulq %r12
xorq %r12, %r12
addq %rax, %r8
movq $19, %rax
adcq %rdx, %r12
mulq %r13
xorq %r13, %r13
addq %rax, %r9
movq $19, %rax
adcq %rdx, %r13
mulq %r14
xorq %r14, %r14
addq %rax, %r10
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
adcq %rax, %r11
adcq $0x00, %rdx
# Overflow
shldq $0x01, %r11, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 24(%rsp), %rdi
movq 152(%rsp), %rsi
movq 136(%rsp), %rbx
# Multiply
# A[0] * B[0]
movq (%rbx), %rax
mulq (%rsi)
movq %rax, %r8
movq %rdx, %r9
# A[0] * B[1]
movq 8(%rbx), %rax
mulq (%rsi)
xorq %r10, %r10
addq %rax, %r9
adcq %rdx, %r10
# A[1] * B[0]
movq (%rbx), %rax
mulq 8(%rsi)
xorq %r11, %r11
addq %rax, %r9
adcq %rdx, %r10
adcq $0x00, %r11
# A[0] * B[2]
movq 16(%rbx), %rax
mulq (%rsi)
addq %rax, %r10
adcq %rdx, %r11
# A[1] * B[1]
movq 8(%rbx), %rax
mulq 8(%rsi)
xorq %r12, %r12
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[2] * B[0]
movq (%rbx), %rax
mulq 16(%rsi)
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[0] * B[3]
movq 24(%rbx), %rax
mulq (%rsi)
xorq %r13, %r13
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[2]
movq 16(%rbx), %rax
mulq 8(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[2] * B[1]
movq 8(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[3] * B[0]
movq (%rbx), %rax
mulq 24(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[3]
movq 24(%rbx), %rax
mulq 8(%rsi)
xorq %r14, %r14
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[2]
movq 16(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[3] * B[1]
movq 8(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[3]
movq 24(%rbx), %rax
mulq 16(%rsi)
xorq %r15, %r15
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[2]
movq 16(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[3]
movq 24(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r14
adcq %rdx, %r15
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rcx, %r11
# Multiply top half by 19
movq $19, %rax
mulq %r12
xorq %r12, %r12
addq %rax, %r8
movq $19, %rax
adcq %rdx, %r12
mulq %r13
xorq %r13, %r13
addq %rax, %r9
movq $19, %rax
adcq %rdx, %r13
mulq %r14
xorq %r14, %r14
addq %rax, %r10
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
adcq %rax, %r11
adcq $0x00, %rdx
# Overflow
shldq $0x01, %r11, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq (%rsp), %rdi
movq 128(%rsp), %rsi
movq 144(%rsp), %rbx
# Multiply
# A[0] * B[0]
movq (%rbx), %rax
mulq (%rsi)
movq %rax, %r8
movq %rdx, %r9
# A[0] * B[1]
movq 8(%rbx), %rax
mulq (%rsi)
xorq %r10, %r10
addq %rax, %r9
adcq %rdx, %r10
# A[1] * B[0]
movq (%rbx), %rax
mulq 8(%rsi)
xorq %r11, %r11
addq %rax, %r9
adcq %rdx, %r10
adcq $0x00, %r11
# A[0] * B[2]
movq 16(%rbx), %rax
mulq (%rsi)
addq %rax, %r10
adcq %rdx, %r11
# A[1] * B[1]
movq 8(%rbx), %rax
mulq 8(%rsi)
xorq %r12, %r12
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[2] * B[0]
movq (%rbx), %rax
mulq 16(%rsi)
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[0] * B[3]
movq 24(%rbx), %rax
mulq (%rsi)
xorq %r13, %r13
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[2]
movq 16(%rbx), %rax
mulq 8(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[2] * B[1]
movq 8(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[3] * B[0]
movq (%rbx), %rax
mulq 24(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[3]
movq 24(%rbx), %rax
mulq 8(%rsi)
xorq %r14, %r14
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[2]
movq 16(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[3] * B[1]
movq 8(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[3]
movq 24(%rbx), %rax
mulq 16(%rsi)
xorq %r15, %r15
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[2]
movq 16(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[3]
movq 24(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r14
adcq %rdx, %r15
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rcx, %r11
# Multiply top half by 19
movq $19, %rax
mulq %r12
xorq %r12, %r12
addq %rax, %r8
movq $19, %rax
adcq %rdx, %r12
mulq %r13
xorq %r13, %r13
addq %rax, %r9
movq $19, %rax
adcq %rdx, %r13
mulq %r14
xorq %r14, %r14
addq %rax, %r10
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
adcq %rax, %r11
adcq $0x00, %rdx
# Overflow
shldq $0x01, %r11, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
leaq 48(%rsp), %rdi
movq (%rsp), %rsi
movq (%rsp), %rbx
# Add
movq (%rsi), %r8
movq 8(%rsi), %r9
addq (%rbx), %r8
movq 16(%rsi), %r10
adcq 8(%rbx), %r9
movq 24(%rsi), %rcx
adcq 16(%rbx), %r10
movq $-19, %rax
adcq 24(%rbx), %rcx
movq $0x7fffffffffffffff, %rdx
movq %rcx, %r11
sarq $63, %rcx
# Mask the modulus
andq %rcx, %rax
andq %rcx, %rdx
# Sub modulus (if overflow)
subq %rax, %r8
sbbq %rcx, %r9
sbbq %rcx, %r10
sbbq %rdx, %r11
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq (%rsp), %rdi
movq 16(%rsp), %rsi
movq 8(%rsp), %rbx
# Sub
movq (%rsi), %r8
movq 8(%rsi), %r9
movq 16(%rsi), %r10
movq 24(%rsi), %r11
subq (%rbx), %r8
movq $0x00, %rcx
sbbq 8(%rbx), %r9
movq $-19, %rax
sbbq 16(%rbx), %r10
movq $0x7fffffffffffffff, %rdx
sbbq 24(%rbx), %r11
sbbq $0x00, %rcx
# Mask the modulus
andq %rcx, %rax
andq %rcx, %rdx
# Add modulus (if underflow)
addq %rax, %r8
adcq %rcx, %r9
adcq %rcx, %r10
adcq %rdx, %r11
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 8(%rsp), %rdi
movq 16(%rsp), %rsi
movq 8(%rsp), %rbx
# Add
movq (%rsi), %r8
movq 8(%rsi), %r9
addq (%rbx), %r8
movq 16(%rsi), %r10
adcq 8(%rbx), %r9
movq 24(%rsi), %rcx
adcq 16(%rbx), %r10
movq $-19, %rax
adcq 24(%rbx), %rcx
movq $0x7fffffffffffffff, %rdx
movq %rcx, %r11
sarq $63, %rcx
# Mask the modulus
andq %rcx, %rax
andq %rcx, %rdx
# Sub modulus (if overflow)
subq %rax, %r8
sbbq %rcx, %r9
sbbq %rcx, %r10
sbbq %rdx, %r11
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 16(%rsp), %rdi
leaq 48(%rsp), %rsi
movq 24(%rsp), %rbx
# Add
movq (%rsi), %r8
movq 8(%rsi), %r9
addq (%rbx), %r8
movq 16(%rsi), %r10
adcq 8(%rbx), %r9
movq 24(%rsi), %rcx
adcq 16(%rbx), %r10
movq $-19, %rax
adcq 24(%rbx), %rcx
movq $0x7fffffffffffffff, %rdx
movq %rcx, %r11
sarq $63, %rcx
# Mask the modulus
andq %rcx, %rax
andq %rcx, %rdx
# Sub modulus (if overflow)
subq %rax, %r8
sbbq %rcx, %r9
sbbq %rcx, %r10
sbbq %rdx, %r11
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 24(%rsp), %rdi
leaq 48(%rsp), %rsi
movq 24(%rsp), %rbx
# Sub
movq (%rsi), %r8
movq 8(%rsi), %r9
movq 16(%rsi), %r10
movq 24(%rsi), %r11
subq (%rbx), %r8
movq $0x00, %rcx
sbbq 8(%rbx), %r9
movq $-19, %rax
sbbq 16(%rbx), %r10
movq $0x7fffffffffffffff, %rdx
sbbq 24(%rbx), %r11
sbbq $0x00, %rcx
# Mask the modulus
andq %rcx, %rax
andq %rcx, %rdx
# Add modulus (if underflow)
addq %rax, %r8
adcq %rcx, %r9
adcq %rcx, %r10
adcq %rdx, %r11
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
addq $0x50, %rsp
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbx
repz retq
#ifndef __APPLE__
.size fe_ge_add_x64,.-fe_ge_add_x64
#endif /* __APPLE__ */
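/* Editorial note: every "# Multiply" block in fe_ge_add_x64 (and in
 * fe_ge_sub_x64 below) is the same routine inlined: a 4x4 schoolbook product
 * of 64-bit limbs into eight words t0..t7, followed by folding the top half
 * back in using 2^255 = 19 (mod p), p = 2^255 - 19.  A minimal C sketch of
 * that pattern, assuming a compiler with unsigned __int128; fe_mul_sketch
 * and uint128_t are illustrative names, not part of wolfSSL's API, and the
 * result is left "almost reduced" (below 2^255 plus a small tail), exactly
 * as the assembly leaves it:
 *
 *   #include <stdint.h>
 *   typedef unsigned __int128 uint128_t;
 *
 *   static void fe_mul_sketch(uint64_t r[4], const uint64_t a[4],
 *                             const uint64_t b[4])
 *   {
 *       uint64_t t[8] = {0}, h[4], c, o;
 *       uint128_t m;
 *       for (int i = 0; i < 4; i++) {            // schoolbook 4x4 product
 *           c = 0;
 *           for (int j = 0; j < 4; j++) {
 *               m = (uint128_t)a[i] * b[j] + t[i + j] + c;
 *               t[i + j] = (uint64_t)m;
 *               c = (uint64_t)(m >> 64);
 *           }
 *           t[i + 4] = c;
 *       }
 *       // split at bit 255: h = t >> 255, low half keeps 255 bits
 *       h[0] = (t[4] << 1) | (t[3] >> 63);
 *       h[1] = (t[5] << 1) | (t[4] >> 63);
 *       h[2] = (t[6] << 1) | (t[5] >> 63);
 *       h[3] = (t[7] << 1) | (t[6] >> 63);
 *       t[3] &= 0x7fffffffffffffffULL;
 *       c = 0;                                   // t += 19 * h
 *       for (int i = 0; i < 4; i++) {
 *           m = (uint128_t)19 * h[i] + t[i] + c;
 *           t[i] = (uint64_t)m;
 *           c = (uint64_t)(m >> 64);
 *       }
 *       // fold the small overflow above bit 255 the same way ("Overflow")
 *       o = ((c << 1) | (t[3] >> 63)) * 19;
 *       t[3] &= 0x7fffffffffffffffULL;
 *       m = (uint128_t)t[0] + o;
 *       r[0] = (uint64_t)m;
 *       for (int i = 1; i < 4; i++) {
 *           m = (uint128_t)t[i] + (uint64_t)(m >> 64);
 *           r[i] = (uint64_t)m;
 *       }
 *       // one more pass on bit 255 ("Reduce if top bit set")
 *       o = (r[3] >> 63) * 19;
 *       r[3] &= 0x7fffffffffffffffULL;
 *       m = (uint128_t)r[0] + o;
 *       r[0] = (uint64_t)m;
 *       for (int i = 1; i < 4; i++) {
 *           m = (uint128_t)r[i] + (uint64_t)(m >> 64);
 *           r[i] = (uint64_t)m;
 *       }
 *   }
 */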
#ifndef __APPLE__
.text
.globl fe_ge_sub_x64
.type fe_ge_sub_x64,@function
.align 4
fe_ge_sub_x64:
#else
.section __TEXT,__text
.globl _fe_ge_sub_x64
.p2align 2
_fe_ge_sub_x64:
#endif /* __APPLE__ */
pushq %rbx
pushq %r12
pushq %r13
pushq %r14
pushq %r15
subq $0x50, %rsp
movq %rdi, (%rsp)
movq %rsi, 8(%rsp)
movq %rdx, 16(%rsp)
movq %rcx, 24(%rsp)
movq %r8, 32(%rsp)
movq %r9, 40(%rsp)
movq (%rsp), %rdi
movq 40(%rsp), %rsi
movq 32(%rsp), %rbx
# Add
movq (%rsi), %r8
movq 8(%rsi), %r9
addq (%rbx), %r8
movq 16(%rsi), %r10
adcq 8(%rbx), %r9
movq 24(%rsi), %rcx
adcq 16(%rbx), %r10
movq $-19, %rax
adcq 24(%rbx), %rcx
movq $0x7fffffffffffffff, %rdx
movq %rcx, %r11
sarq $63, %rcx
# Mask the modulus
andq %rcx, %rax
andq %rcx, %rdx
# Sub modulus (if overflow)
subq %rax, %r8
sbbq %rcx, %r9
sbbq %rcx, %r10
sbbq %rdx, %r11
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 8(%rsp), %rdi
movq 40(%rsp), %rsi
movq 32(%rsp), %rbx
# Sub
movq (%rsi), %r8
movq 8(%rsi), %r9
movq 16(%rsi), %r10
movq 24(%rsi), %r11
subq (%rbx), %r8
movq $0x00, %rcx
sbbq 8(%rbx), %r9
movq $-19, %rax
sbbq 16(%rbx), %r10
movq $0x7fffffffffffffff, %rdx
sbbq 24(%rbx), %r11
sbbq $0x00, %rcx
# Mask the modulus
andq %rcx, %rax
andq %rcx, %rdx
# Add modulus (if underflow)
addq %rax, %r8
adcq %rcx, %r9
adcq %rcx, %r10
adcq %rdx, %r11
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 16(%rsp), %rdi
movq (%rsp), %rsi
movq 168(%rsp), %rbx
# Multiply
# A[0] * B[0]
movq (%rbx), %rax
mulq (%rsi)
movq %rax, %r8
movq %rdx, %r9
# A[0] * B[1]
movq 8(%rbx), %rax
mulq (%rsi)
xorq %r10, %r10
addq %rax, %r9
adcq %rdx, %r10
# A[1] * B[0]
movq (%rbx), %rax
mulq 8(%rsi)
xorq %r11, %r11
addq %rax, %r9
adcq %rdx, %r10
adcq $0x00, %r11
# A[0] * B[2]
movq 16(%rbx), %rax
mulq (%rsi)
addq %rax, %r10
adcq %rdx, %r11
# A[1] * B[1]
movq 8(%rbx), %rax
mulq 8(%rsi)
xorq %r12, %r12
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[2] * B[0]
movq (%rbx), %rax
mulq 16(%rsi)
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[0] * B[3]
movq 24(%rbx), %rax
mulq (%rsi)
xorq %r13, %r13
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[2]
movq 16(%rbx), %rax
mulq 8(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[2] * B[1]
movq 8(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[3] * B[0]
movq (%rbx), %rax
mulq 24(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[3]
movq 24(%rbx), %rax
mulq 8(%rsi)
xorq %r14, %r14
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[2]
movq 16(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[3] * B[1]
movq 8(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[3]
movq 24(%rbx), %rax
mulq 16(%rsi)
xorq %r15, %r15
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[2]
movq 16(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[3]
movq 24(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r14
adcq %rdx, %r15
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rcx, %r11
# Multiply top half by 19
movq $19, %rax
mulq %r12
xorq %r12, %r12
addq %rax, %r8
movq $19, %rax
adcq %rdx, %r12
mulq %r13
xorq %r13, %r13
addq %rax, %r9
movq $19, %rax
adcq %rdx, %r13
mulq %r14
xorq %r14, %r14
addq %rax, %r10
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
adcq %rax, %r11
adcq $0x00, %rdx
# Overflow
shldq $0x01, %r11, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 8(%rsp), %rdi
movq 8(%rsp), %rsi
movq 160(%rsp), %rbx
# Multiply
# A[0] * B[0]
movq (%rbx), %rax
mulq (%rsi)
movq %rax, %r8
movq %rdx, %r9
# A[0] * B[1]
movq 8(%rbx), %rax
mulq (%rsi)
xorq %r10, %r10
addq %rax, %r9
adcq %rdx, %r10
# A[1] * B[0]
movq (%rbx), %rax
mulq 8(%rsi)
xorq %r11, %r11
addq %rax, %r9
adcq %rdx, %r10
adcq $0x00, %r11
# A[0] * B[2]
movq 16(%rbx), %rax
mulq (%rsi)
addq %rax, %r10
adcq %rdx, %r11
# A[1] * B[1]
movq 8(%rbx), %rax
mulq 8(%rsi)
xorq %r12, %r12
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[2] * B[0]
movq (%rbx), %rax
mulq 16(%rsi)
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[0] * B[3]
movq 24(%rbx), %rax
mulq (%rsi)
xorq %r13, %r13
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[2]
movq 16(%rbx), %rax
mulq 8(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[2] * B[1]
movq 8(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[3] * B[0]
movq (%rbx), %rax
mulq 24(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[3]
movq 24(%rbx), %rax
mulq 8(%rsi)
xorq %r14, %r14
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[2]
movq 16(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[3] * B[1]
movq 8(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[3]
movq 24(%rbx), %rax
mulq 16(%rsi)
xorq %r15, %r15
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[2]
movq 16(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[3]
movq 24(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r14
adcq %rdx, %r15
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rcx, %r11
# Multiply top half by 19
movq $19, %rax
mulq %r12
xorq %r12, %r12
addq %rax, %r8
movq $19, %rax
adcq %rdx, %r12
mulq %r13
xorq %r13, %r13
addq %rax, %r9
movq $19, %rax
adcq %rdx, %r13
mulq %r14
xorq %r14, %r14
addq %rax, %r10
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
adcq %rax, %r11
adcq $0x00, %rdx
# Overflow
shldq $0x01, %r11, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 24(%rsp), %rdi
movq 152(%rsp), %rsi
movq 136(%rsp), %rbx
# Multiply
# A[0] * B[0]
movq (%rbx), %rax
mulq (%rsi)
movq %rax, %r8
movq %rdx, %r9
# A[0] * B[1]
movq 8(%rbx), %rax
mulq (%rsi)
xorq %r10, %r10
addq %rax, %r9
adcq %rdx, %r10
# A[1] * B[0]
movq (%rbx), %rax
mulq 8(%rsi)
xorq %r11, %r11
addq %rax, %r9
adcq %rdx, %r10
adcq $0x00, %r11
# A[0] * B[2]
movq 16(%rbx), %rax
mulq (%rsi)
addq %rax, %r10
adcq %rdx, %r11
# A[1] * B[1]
movq 8(%rbx), %rax
mulq 8(%rsi)
xorq %r12, %r12
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[2] * B[0]
movq (%rbx), %rax
mulq 16(%rsi)
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[0] * B[3]
movq 24(%rbx), %rax
mulq (%rsi)
xorq %r13, %r13
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[2]
movq 16(%rbx), %rax
mulq 8(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[2] * B[1]
movq 8(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[3] * B[0]
movq (%rbx), %rax
mulq 24(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[3]
movq 24(%rbx), %rax
mulq 8(%rsi)
xorq %r14, %r14
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[2]
movq 16(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[3] * B[1]
movq 8(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[3]
movq 24(%rbx), %rax
mulq 16(%rsi)
xorq %r15, %r15
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[2]
movq 16(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[3]
movq 24(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r14
adcq %rdx, %r15
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rcx, %r11
# Multiply top half by 19
movq $19, %rax
mulq %r12
xorq %r12, %r12
addq %rax, %r8
movq $19, %rax
adcq %rdx, %r12
mulq %r13
xorq %r13, %r13
addq %rax, %r9
movq $19, %rax
adcq %rdx, %r13
mulq %r14
xorq %r14, %r14
addq %rax, %r10
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
adcq %rax, %r11
adcq $0x00, %rdx
# Overflow
shldq $0x01, %r11, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq (%rsp), %rdi
movq 128(%rsp), %rsi
movq 144(%rsp), %rbx
# Multiply
# A[0] * B[0]
movq (%rbx), %rax
mulq (%rsi)
movq %rax, %r8
movq %rdx, %r9
# A[0] * B[1]
movq 8(%rbx), %rax
mulq (%rsi)
xorq %r10, %r10
addq %rax, %r9
adcq %rdx, %r10
# A[1] * B[0]
movq (%rbx), %rax
mulq 8(%rsi)
xorq %r11, %r11
addq %rax, %r9
adcq %rdx, %r10
adcq $0x00, %r11
# A[0] * B[2]
movq 16(%rbx), %rax
mulq (%rsi)
addq %rax, %r10
adcq %rdx, %r11
# A[1] * B[1]
movq 8(%rbx), %rax
mulq 8(%rsi)
xorq %r12, %r12
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[2] * B[0]
movq (%rbx), %rax
mulq 16(%rsi)
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
# A[0] * B[3]
movq 24(%rbx), %rax
mulq (%rsi)
xorq %r13, %r13
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[2]
movq 16(%rbx), %rax
mulq 8(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[2] * B[1]
movq 8(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[3] * B[0]
movq (%rbx), %rax
mulq 24(%rsi)
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
# A[1] * B[3]
movq 24(%rbx), %rax
mulq 8(%rsi)
xorq %r14, %r14
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[2]
movq 16(%rbx), %rax
mulq 16(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[3] * B[1]
movq 8(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r12
adcq %rdx, %r13
adcq $0x00, %r14
# A[2] * B[3]
movq 24(%rbx), %rax
mulq 16(%rsi)
xorq %r15, %r15
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[2]
movq 16(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r13
adcq %rdx, %r14
adcq $0x00, %r15
# A[3] * B[3]
movq 24(%rbx), %rax
mulq 24(%rsi)
addq %rax, %r14
adcq %rdx, %r15
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rcx, %r11
# Multiply top half by 19
movq $19, %rax
mulq %r12
xorq %r12, %r12
addq %rax, %r8
movq $19, %rax
adcq %rdx, %r12
mulq %r13
xorq %r13, %r13
addq %rax, %r9
movq $19, %rax
adcq %rdx, %r13
mulq %r14
xorq %r14, %r14
addq %rax, %r10
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
adcq %rax, %r11
adcq $0x00, %rdx
# Overflow
shldq $0x01, %r11, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
leaq 48(%rsp), %rdi
movq (%rsp), %rsi
movq (%rsp), %rbx
# Add
movq (%rsi), %r8
movq 8(%rsi), %r9
addq (%rbx), %r8
movq 16(%rsi), %r10
adcq 8(%rbx), %r9
movq 24(%rsi), %rcx
adcq 16(%rbx), %r10
movq $-19, %rax
adcq 24(%rbx), %rcx
movq $0x7fffffffffffffff, %rdx
movq %rcx, %r11
sarq $63, %rcx
# Mask the modulus
andq %rcx, %rax
andq %rcx, %rdx
# Sub modulus (if overflow)
subq %rax, %r8
sbbq %rcx, %r9
sbbq %rcx, %r10
sbbq %rdx, %r11
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq (%rsp), %rdi
movq 16(%rsp), %rsi
movq 8(%rsp), %rbx
# Sub
movq (%rsi), %r8
movq 8(%rsi), %r9
movq 16(%rsi), %r10
movq 24(%rsi), %r11
subq (%rbx), %r8
movq $0x00, %rcx
sbbq 8(%rbx), %r9
movq $-19, %rax
sbbq 16(%rbx), %r10
movq $0x7fffffffffffffff, %rdx
sbbq 24(%rbx), %r11
sbbq $0x00, %rcx
# Mask the modulus
andq %rcx, %rax
andq %rcx, %rdx
# Add modulus (if underflow)
addq %rax, %r8
adcq %rcx, %r9
adcq %rcx, %r10
adcq %rdx, %r11
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 8(%rsp), %rdi
movq 16(%rsp), %rsi
movq 8(%rsp), %rbx
# Add
movq (%rsi), %r8
movq 8(%rsi), %r9
addq (%rbx), %r8
movq 16(%rsi), %r10
adcq 8(%rbx), %r9
movq 24(%rsi), %rcx
adcq 16(%rbx), %r10
movq $-19, %rax
adcq 24(%rbx), %rcx
movq $0x7fffffffffffffff, %rdx
movq %rcx, %r11
sarq $63, %rcx
# Mask the modulus
andq %rcx, %rax
andq %rcx, %rdx
# Sub modulus (if overflow)
subq %rax, %r8
sbbq %rcx, %r9
sbbq %rcx, %r10
sbbq %rdx, %r11
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 16(%rsp), %rdi
leaq 48(%rsp), %rsi
movq 24(%rsp), %rbx
# Sub
movq (%rsi), %r8
movq 8(%rsi), %r9
movq 16(%rsi), %r10
movq 24(%rsi), %r11
subq (%rbx), %r8
movq $0x00, %rcx
sbbq 8(%rbx), %r9
movq $-19, %rax
sbbq 16(%rbx), %r10
movq $0x7fffffffffffffff, %rdx
sbbq 24(%rbx), %r11
sbbq $0x00, %rcx
# Mask the modulus
andq %rcx, %rax
andq %rcx, %rdx
# Add modulus (if underflow)
addq %rax, %r8
adcq %rcx, %r9
adcq %rcx, %r10
adcq %rdx, %r11
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 24(%rsp), %rdi
leaq 48(%rsp), %rsi
movq 24(%rsp), %rbx
# Add
movq (%rsi), %r8
movq 8(%rsi), %r9
addq (%rbx), %r8
movq 16(%rsi), %r10
adcq 8(%rbx), %r9
movq 24(%rsi), %rcx
adcq 16(%rbx), %r10
movq $-19, %rax
adcq 24(%rbx), %rcx
movq $0x7fffffffffffffff, %rdx
movq %rcx, %r11
sarq $63, %rcx
# Mask the modulus
andq %rcx, %rax
andq %rcx, %rdx
# Sub modulus (if overflow)
subq %rax, %r8
sbbq %rcx, %r9
sbbq %rcx, %r10
sbbq %rdx, %r11
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
addq $0x50, %rsp
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbx
repz retq
#ifndef __APPLE__
.size fe_ge_sub_x64,.-fe_ge_sub_x64
#endif /* __APPLE__ */
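/* Editorial note: the "# Add"/"# Sub" blocks in both fe_ge_* functions keep
 * limbs in range without branching.  After the raw add or subtract, the sign
 * or borrow is smeared into an all-zero/all-one mask, and p = 2^255 - 19
 * (limbs 0xffffffffffffffed, 0xffffffffffffffff, 0xffffffffffffffff,
 * 0x7fffffffffffffff) is conditionally added or subtracted under that mask.
 * A C sketch of the subtraction path, assuming unsigned __int128
 * (fe_sub_sketch is an illustrative name, not wolfSSL API):
 *
 *   #include <stdint.h>
 *
 *   static void fe_sub_sketch(uint64_t r[4], const uint64_t a[4],
 *                             const uint64_t b[4])
 *   {
 *       unsigned __int128 d;
 *       uint64_t borrow = 0, mask;
 *       for (int i = 0; i < 4; i++) {         // subtract with borrow
 *           d = (unsigned __int128)a[i] - b[i] - borrow;
 *           r[i] = (uint64_t)d;
 *           borrow = (uint64_t)(d >> 64) & 1; // 1 iff the limb wrapped
 *       }
 *       mask = 0 - borrow;                    // like "sbbq $0x00, %rcx"
 *       // add p back in under the mask
 *       d = (unsigned __int128)r[0] + (0xffffffffffffffedULL & mask);
 *       r[0] = (uint64_t)d;
 *       d = (unsigned __int128)r[1] + mask + (uint64_t)(d >> 64);
 *       r[1] = (uint64_t)d;
 *       d = (unsigned __int128)r[2] + mask + (uint64_t)(d >> 64);
 *       r[2] = (uint64_t)d;
 *       d = (unsigned __int128)r[3] + (mask >> 1) + (uint64_t)(d >> 64);
 *       r[3] = (uint64_t)d;
 *   }
 *
 * The addition path is analogous: the sign bit of the top limb of the sum
 * selects whether p is subtracted back out (the operands are kept partially
 * reduced, so the sum never needs a 257th bit).
 */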
#ifdef HAVE_INTEL_AVX2
#ifndef __APPLE__
.text
.globl fe_mul_avx2
.type fe_mul_avx2,@function
.align 4
fe_mul_avx2:
#else
.section __TEXT,__text
.globl _fe_mul_avx2
.p2align 2
_fe_mul_avx2:
#endif /* __APPLE__ */
pushq %r12
pushq %r13
pushq %r14
pushq %r15
pushq %rbx
movq %rdx, %rbx
# Multiply
# A[0] * B[0]
movq (%rbx), %rdx
mulxq (%rsi), %r8, %r9
# A[2] * B[0]
mulxq 16(%rsi), %r10, %r11
# A[1] * B[0]
mulxq 8(%rsi), %rax, %rcx
xorq %r15, %r15
adcxq %rax, %r9
# A[1] * B[3]
movq 24(%rbx), %rdx
mulxq 8(%rsi), %r12, %r13
adcxq %rcx, %r10
# A[0] * B[1]
movq 8(%rbx), %rdx
mulxq (%rsi), %rax, %rcx
adoxq %rax, %r9
# A[2] * B[1]
mulxq 16(%rsi), %rax, %r14
adoxq %rcx, %r10
adcxq %rax, %r11
# A[1] * B[2]
movq 16(%rbx), %rdx
mulxq 8(%rsi), %rax, %rcx
adcxq %r14, %r12
adoxq %rax, %r11
adcxq %r15, %r13
adoxq %rcx, %r12
# A[0] * B[2]
mulxq (%rsi), %rax, %rcx
adoxq %r15, %r13
xorq %r14, %r14
adcxq %rax, %r10
# A[1] * B[1]
movq 8(%rbx), %rdx
mulxq 8(%rsi), %rdx, %rax
adcxq %rcx, %r11
adoxq %rdx, %r10
# A[3] * B[1]
movq 8(%rbx), %rdx
adoxq %rax, %r11
mulxq 24(%rsi), %rax, %rcx
adcxq %rax, %r12
# A[2] * B[2]
movq 16(%rbx), %rdx
mulxq 16(%rsi), %rdx, %rax
adcxq %rcx, %r13
adoxq %rdx, %r12
# A[3] * B[3]
movq 24(%rbx), %rdx
adoxq %rax, %r13
mulxq 24(%rsi), %rax, %rcx
adoxq %r15, %r14
adcxq %rax, %r14
# A[0] * B[3]
mulxq (%rsi), %rdx, %rax
adcxq %rcx, %r15
xorq %rcx, %rcx
adcxq %rdx, %r11
# A[3] * B[0]
movq (%rbx), %rdx
adcxq %rax, %r12
mulxq 24(%rsi), %rdx, %rax
adoxq %rdx, %r11
adoxq %rax, %r12
# A[2] * B[3]
movq 24(%rbx), %rdx
mulxq 16(%rsi), %rdx, %rax
adcxq %rdx, %r13
# A[3] * B[2]
movq 16(%rbx), %rdx
adcxq %rax, %r14
mulxq 24(%rsi), %rax, %rdx
adcxq %rcx, %r15
adoxq %rax, %r13
adoxq %rdx, %r14
adoxq %rcx, %r15
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rcx, %r11
# Multiply top half by 19
movq $19, %rdx
xorq %rcx, %rcx
mulxq %r12, %rax, %r12
adcxq %rax, %r8
adoxq %r12, %r9
mulxq %r13, %rax, %r13
adcxq %rax, %r9
adoxq %r13, %r10
mulxq %r14, %rax, %r14
adcxq %rax, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rcx, %rdx
adcxq %rcx, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rcx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
popq %rbx
popq %r15
popq %r14
popq %r13
popq %r12
repz retq
#ifndef __APPLE__
.size fe_mul_avx2,.-fe_mul_avx2
#endif /* __APPLE__ */
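/* Editorial note: unlike the mulq-based fe_ge_* code above, fe_mul_avx2 uses
 * BMI2/ADX instructions: mulxq takes one operand implicitly from %rdx and
 * writes lo/hi to two registers without touching flags, while adcxq and
 * adoxq carry through CF and OF independently, so two carry chains can run
 * interleaved instead of serialising on a single flag.  The instruction pair
 * is also reachable from C via intrinsics; a sketch of one multiply-
 * accumulate step, assuming <immintrin.h> and -mbmi2 -madx (mac_step is an
 * illustrative helper, not a wolfSSL function):
 *
 *   #include <stdint.h>
 *   #include <immintrin.h>
 *
 *   // acc[0..1] += a * b, threading the running carry c of one chain
 *   static inline unsigned char mac_step(unsigned char c, uint64_t a,
 *                                        uint64_t b,
 *                                        unsigned long long acc[2])
 *   {
 *       unsigned long long hi;
 *       unsigned long long lo = _mulx_u64(a, b, &hi); // flags untouched
 *       c = _addcarryx_u64(c, acc[0], lo, &acc[0]);
 *       c = _addcarryx_u64(c, acc[1], hi, &acc[1]);
 *       return c;
 *   }
 *
 * Keeping two such chains live at once is what the alternating adcxq/adoxq
 * pattern in the 4x4 multiply above achieves directly in assembly.
 */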
#ifndef __APPLE__
.text
.globl fe_sq_avx2
.type fe_sq_avx2,@function
.align 4
fe_sq_avx2:
#else
.section __TEXT,__text
.globl _fe_sq_avx2
.p2align 2
_fe_sq_avx2:
#endif /* __APPLE__ */
pushq %rbx
pushq %r12
pushq %r13
pushq %r14
pushq %r15
# Square
# A[0] * A[1]
movq (%rsi), %rdx
mulxq 8(%rsi), %r9, %r10
# A[0] * A[3]
mulxq 24(%rsi), %r11, %r12
# A[2] * A[1]
movq 16(%rsi), %rdx
mulxq 8(%rsi), %rcx, %rbx
xorq %r15, %r15
adoxq %rcx, %r11
# A[2] * A[3]
mulxq 24(%rsi), %r13, %r14
adoxq %rbx, %r12
# A[2] * A[0]
mulxq (%rsi), %rcx, %rbx
adoxq %r15, %r13
adcxq %rcx, %r10
adoxq %r15, %r14
# A[1] * A[3]
movq 8(%rsi), %rdx
mulxq 24(%rsi), %rax, %r8
adcxq %rbx, %r11
adcxq %rax, %r12
adcxq %r8, %r13
adcxq %r15, %r14
# Double with Carry Flag
xorq %r15, %r15
# A[0] * A[0]
movq (%rsi), %rdx
mulxq %rdx, %r8, %rax
adcxq %r9, %r9
# A[1] * A[1]
movq 8(%rsi), %rdx
mulxq %rdx, %rcx, %rbx
adcxq %r10, %r10
adoxq %rax, %r9
adcxq %r11, %r11
adoxq %rcx, %r10
# A[2] * A[2]
movq 16(%rsi), %rdx
mulxq %rdx, %rax, %rcx
adcxq %r12, %r12
adoxq %rbx, %r11
adcxq %r13, %r13
adoxq %rax, %r12
# A[3] * A[3]
movq 24(%rsi), %rdx
mulxq %rdx, %rax, %rbx
adcxq %r14, %r14
adoxq %rcx, %r13
adcxq %r15, %r15
adoxq %rax, %r14
adoxq %rbx, %r15
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rcx, %r11
# Multiply top half by 19
movq $19, %rdx
xorq %rcx, %rcx
mulxq %r12, %rax, %r12
adcxq %rax, %r8
adoxq %r12, %r9
mulxq %r13, %rax, %r13
adcxq %rax, %r9
adoxq %r13, %r10
mulxq %r14, %rax, %r14
adcxq %rax, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rcx, %rdx
adcxq %rcx, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rcx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbx
repz retq
#ifndef __APPLE__
.size fe_sq_avx2,.-fe_sq_avx2
#endif /* __APPLE__ */
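/* Editorial note: squaring roughly halves the multiply work because each
 * cross product a[i]*a[j] (i != j) is needed twice.  fe_sq_avx2 therefore
 * computes the six off-diagonal products once, doubles the accumulator
 * ("Double with Carry Flag"), then adds the four diagonal squares a[i]^2.
 * A C sketch of the same structure before reduction, assuming unsigned
 * __int128 (illustrative names only):
 *
 *   #include <stdint.h>
 *   typedef unsigned __int128 uint128_t;
 *
 *   static void fe_sq_partial_sketch(uint64_t t[8], const uint64_t a[4])
 *   {
 *       uint128_t m;
 *       uint64_t c, hi;
 *       for (int i = 0; i < 8; i++) t[i] = 0;
 *       for (int i = 0; i < 4; i++) {       // off-diagonal terms, once
 *           c = 0;
 *           for (int j = i + 1; j < 4; j++) {
 *               m = (uint128_t)a[i] * a[j] + t[i + j] + c;
 *               t[i + j] = (uint64_t)m;
 *               c = (uint64_t)(m >> 64);
 *           }
 *           t[i + 4] = c;
 *       }
 *       c = 0;                              // double the accumulator
 *       for (int i = 1; i < 8; i++) {
 *           hi = t[i] >> 63;
 *           t[i] = (t[i] << 1) | c;
 *           c = hi;
 *       }
 *       c = 0;                              // add the diagonal squares
 *       for (int i = 0; i < 4; i++) {
 *           m = (uint128_t)a[i] * a[i] + t[2 * i] + c;
 *           t[2 * i] = (uint64_t)m;
 *           m = (uint128_t)t[2 * i + 1] + (uint64_t)(m >> 64);
 *           t[2 * i + 1] = (uint64_t)m;
 *           c = (uint64_t)(m >> 64);
 *       }
 *       // t[0..7] = a^2; reduction mod 2^255-19 proceeds as in fe_mul
 *   }
 */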
#ifndef __APPLE__
.text
.globl fe_sq_n_avx2
.type fe_sq_n_avx2,@function
.align 4
fe_sq_n_avx2:
#else
.section __TEXT,__text
.globl _fe_sq_n_avx2
.p2align 2
_fe_sq_n_avx2:
#endif /* __APPLE__ */
pushq %rbx
pushq %r12
pushq %r13
pushq %r14
pushq %r15
pushq %rbp
movq %rdx, %rbp
L_fe_sq_n_avx2:
# Square
# A[0] * A[1]
movq (%rsi), %rdx
mulxq 8(%rsi), %r9, %r10
# A[0] * A[3]
mulxq 24(%rsi), %r11, %r12
# A[2] * A[1]
movq 16(%rsi), %rdx
mulxq 8(%rsi), %rcx, %rbx
xorq %r15, %r15
adoxq %rcx, %r11
# A[2] * A[3]
mulxq 24(%rsi), %r13, %r14
adoxq %rbx, %r12
# A[2] * A[0]
mulxq (%rsi), %rcx, %rbx
adoxq %r15, %r13
adcxq %rcx, %r10
adoxq %r15, %r14
# A[1] * A[3]
movq 8(%rsi), %rdx
mulxq 24(%rsi), %rax, %r8
adcxq %rbx, %r11
adcxq %rax, %r12
adcxq %r8, %r13
adcxq %r15, %r14
# Double with Carry Flag
xorq %r15, %r15
# A[0] * A[0]
movq (%rsi), %rdx
mulxq %rdx, %r8, %rax
adcxq %r9, %r9
# A[1] * A[1]
movq 8(%rsi), %rdx
mulxq %rdx, %rcx, %rbx
adcxq %r10, %r10
adoxq %rax, %r9
adcxq %r11, %r11
adoxq %rcx, %r10
# A[2] * A[2]
movq 16(%rsi), %rdx
mulxq %rdx, %rax, %rcx
adcxq %r12, %r12
adoxq %rbx, %r11
adcxq %r13, %r13
adoxq %rax, %r12
# A[3] * A[3]
movq 24(%rsi), %rdx
mulxq %rdx, %rax, %rbx
adcxq %r14, %r14
adoxq %rcx, %r13
adcxq %r15, %r15
adoxq %rax, %r14
adoxq %rbx, %r15
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rcx, %r11
# Multiply top half by 19
movq $19, %rdx
xorq %rcx, %rcx
mulxq %r12, %rax, %r12
adcxq %rax, %r8
adoxq %r12, %r9
mulxq %r13, %rax, %r13
adcxq %rax, %r9
adoxq %r13, %r10
mulxq %r14, %rax, %r14
adcxq %rax, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rcx, %rdx
adcxq %rcx, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rcx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
decb %bpl
jnz L_fe_sq_n_avx2
popq %rbp
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbx
repz retq
#ifndef __APPLE__
.size fe_sq_n_avx2,.-fe_sq_n_avx2
#endif /* __APPLE__ */
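/* Editorial note: fe_sq_n_avx2(r, a, n) iterates the squaring above.  The
 * count arrives in %rdx but is parked in %rbp, since mulxq consumes %rdx,
 * and "decb %bpl / jnz" loops back to L_fe_sq_n_avx2.  Every call site in
 * this file passes r == a (see the inversion chain below), so the effect is
 * in-place repeated squaring:
 *
 *   for (i = 0; i < n; i++)
 *       fe_sq(a, a);                        // afterwards a = a^(2^n)
 *
 * Note the counter is decremented as a byte, so n is limited to 255; the
 * largest count used in this file is 0x63 (99).
 */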
#ifndef __APPLE__
.text
.globl fe_mul121666_avx2
.type fe_mul121666_avx2,@function
.align 4
fe_mul121666_avx2:
#else
.section __TEXT,__text
.globl _fe_mul121666_avx2
.p2align 2
_fe_mul121666_avx2:
#endif /* __APPLE__ */
pushq %r12
pushq %r13
movq $0x1db42, %rdx
mulxq (%rsi), %rax, %r13
mulxq 8(%rsi), %rcx, %r12
mulxq 16(%rsi), %r8, %r11
mulxq 24(%rsi), %r9, %r10
addq %r13, %rcx
adcq %r12, %r8
adcq %r11, %r9
adcq $0x00, %r10
movq $0x7fffffffffffffff, %r13
shldq $0x01, %r9, %r10
andq %r13, %r9
imulq $19, %r10, %r10
addq %r10, %rax
adcq $0x00, %rcx
adcq $0x00, %r8
adcq $0x00, %r9
movq %rax, (%rdi)
movq %rcx, 8(%rdi)
movq %r8, 16(%rdi)
movq %r9, 24(%rdi)
popq %r13
popq %r12
repz retq
#ifndef __APPLE__
.size fe_mul121666_avx2,.-fe_mul121666_avx2
#endif /* __APPLE__ */
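/* Editorial note: 0x1db42 = 121666 = (486662 + 2)/4, the curve constant used
 * by the Montgomery ladder step.  Four mulxq instructions give a 64x256-bit
 * product; the fifth limb plus bit 255 is then folded back in with
 * 2^255 = 19 (mod p).  A C sketch, assuming unsigned __int128 (illustrative
 * names, not wolfSSL API):
 *
 *   #include <stdint.h>
 *   typedef unsigned __int128 uint128_t;
 *
 *   static void fe_mul121666_sketch(uint64_t r[4], const uint64_t a[4])
 *   {
 *       uint128_t m = 0;
 *       uint64_t top;
 *       for (int i = 0; i < 4; i++) {       // r = a * 121666 with carry
 *           m = (uint128_t)a[i] * 121666 + (uint64_t)(m >> 64);
 *           r[i] = (uint64_t)m;
 *       }
 *       top = ((uint64_t)(m >> 64) << 1) | (r[3] >> 63);
 *       r[3] &= 0x7fffffffffffffffULL;      // clear bit 255
 *       m = (uint128_t)r[0] + (uint128_t)top * 19;
 *       r[0] = (uint64_t)m;
 *       for (int i = 1; i < 4; i++) {       // ripple the tiny carry up
 *           m = (uint128_t)r[i] + (uint64_t)(m >> 64);
 *           r[i] = (uint64_t)m;
 *       }
 *   }
 */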
#ifndef __APPLE__
.text
.globl fe_sq2_avx2
.type fe_sq2_avx2,@function
.align 4
fe_sq2_avx2:
#else
.section __TEXT,__text
.globl _fe_sq2_avx2
.p2align 2
_fe_sq2_avx2:
#endif /* __APPLE__ */
pushq %rbx
pushq %r12
pushq %r13
pushq %r14
pushq %r15
# Square * 2
# A[0] * A[1]
movq (%rsi), %rdx
mulxq 8(%rsi), %r9, %r10
# A[0] * A[3]
mulxq 24(%rsi), %r11, %r12
# A[2] * A[1]
movq 16(%rsi), %rdx
mulxq 8(%rsi), %rcx, %rbx
xorq %r15, %r15
adoxq %rcx, %r11
# A[2] * A[3]
mulxq 24(%rsi), %r13, %r14
adoxq %rbx, %r12
# A[2] * A[0]
mulxq (%rsi), %rcx, %rbx
adoxq %r15, %r13
adcxq %rcx, %r10
adoxq %r15, %r14
# A[1] * A[3]
movq 8(%rsi), %rdx
mulxq 24(%rsi), %rax, %r8
adcxq %rbx, %r11
adcxq %rax, %r12
adcxq %r8, %r13
adcxq %r15, %r14
# Double with Carry Flag
xorq %r15, %r15
# A[0] * A[0]
movq (%rsi), %rdx
mulxq %rdx, %r8, %rax
adcxq %r9, %r9
# A[1] * A[1]
movq 8(%rsi), %rdx
mulxq %rdx, %rcx, %rbx
adcxq %r10, %r10
adoxq %rax, %r9
adcxq %r11, %r11
adoxq %rcx, %r10
# A[2] * A[2]
movq 16(%rsi), %rdx
mulxq %rdx, %rax, %rcx
adcxq %r12, %r12
adoxq %rbx, %r11
adcxq %r13, %r13
adoxq %rax, %r12
# A[3] * A[3]
movq 24(%rsi), %rdx
mulxq %rdx, %rax, %rbx
adcxq %r14, %r14
adoxq %rcx, %r13
adcxq %r15, %r15
adoxq %rax, %r14
adoxq %rbx, %r15
# Reduce
movq $0x7fffffffffffffff, %rbx
xorq %rax, %rax
# Move top half into t4-t7 and remove top bit from t3 and double
shldq $3, %r15, %rax
shldq $2, %r14, %r15
shldq $2, %r13, %r14
shldq $2, %r12, %r13
shldq $2, %r11, %r12
shldq $0x01, %r10, %r11
shldq $0x01, %r9, %r10
shldq $0x01, %r8, %r9
shlq $0x01, %r8
andq %rbx, %r11
# Two out left, one in right
andq %rbx, %r15
# Multiply top bits by 19*19
imulq $0x169, %rax, %rcx
xorq %rbx, %rbx
# Multiply top half by 19
movq $19, %rdx
adoxq %rcx, %r8
mulxq %r12, %rax, %r12
adcxq %rax, %r8
adoxq %r12, %r9
mulxq %r13, %rax, %r13
adcxq %rax, %r9
adoxq %r13, %r10
mulxq %r14, %rax, %r14
adcxq %rax, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rbx, %rdx
adcxq %rbx, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rbx
imulq $19, %rdx, %rax
andq %rbx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rbx, %r11
addq %rax, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbx
repz retq
#ifndef __APPLE__
.size fe_sq2_avx2,.-fe_sq2_avx2
#endif /* __APPLE__ */
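/* Editorial note: fe_sq2 returns 2*a^2 mod p, and the doubling is merged
 * into the reduction instead of being a separate pass.  Where fe_sq shifts
 * the top half left by 1 to isolate the bits at 2^255 and above, fe_sq2
 * shifts the low limbs by 1 (the doubling itself) and the top half by 2,
 * and the bits shifted out of the very top are folded with 0x169 = 361 =
 * 19^2, since
 *
 *   2^255 = 19 (mod p)   and therefore   2^510 = 19^2 = 361 (mod p).
 *
 * That is what the "shldq $3 / shldq $2 / imulq $0x169" sequence above
 * implements.
 */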
#ifndef __APPLE__
.text
.globl fe_invert_avx2
.type fe_invert_avx2,@function
.align 4
fe_invert_avx2:
#else
.section __TEXT,__text
.globl _fe_invert_avx2
.p2align 2
_fe_invert_avx2:
#endif /* __APPLE__ */
subq $0x90, %rsp
# Invert
movq %rdi, 128(%rsp)
movq %rsi, 136(%rsp)
movq %rsp, %rdi
movq 136(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
movq %rsp, %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 32(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
movq 136(%rsp), %rsi
leaq 32(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
movq %rsp, %rdi
movq %rsp, %rsi
leaq 32(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
movq %rsp, %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 32(%rsp), %rsi
leaq 64(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 32(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 64(%rsp), %rsi
movq $4, %rdx
#ifndef __APPLE__
callq fe_sq_n_avx2@plt
#else
callq _fe_sq_n_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 64(%rsp), %rsi
leaq 32(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 32(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 64(%rsp), %rsi
movq $9, %rdx
#ifndef __APPLE__
callq fe_sq_n_avx2@plt
#else
callq _fe_sq_n_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 64(%rsp), %rsi
leaq 32(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 64(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 96(%rsp), %rsi
movq $19, %rdx
#ifndef __APPLE__
callq fe_sq_n_avx2@plt
#else
callq _fe_sq_n_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 96(%rsp), %rsi
leaq 64(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 64(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 64(%rsp), %rsi
movq $9, %rdx
#ifndef __APPLE__
callq fe_sq_n_avx2@plt
#else
callq _fe_sq_n_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 64(%rsp), %rsi
leaq 32(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 32(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 64(%rsp), %rsi
movq $49, %rdx
#ifndef __APPLE__
callq fe_sq_n_avx2@plt
#else
callq _fe_sq_n_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 64(%rsp), %rsi
leaq 32(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 64(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 96(%rsp), %rsi
movq $0x63, %rdx
#ifndef __APPLE__
callq fe_sq_n_avx2@plt
#else
callq _fe_sq_n_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 96(%rsp), %rsi
leaq 64(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 64(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 64(%rsp), %rsi
movq $49, %rdx
#ifndef __APPLE__
callq fe_sq_n_avx2@plt
#else
callq _fe_sq_n_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 64(%rsp), %rsi
leaq 32(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 32(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 32(%rsp), %rsi
movq $4, %rdx
#ifndef __APPLE__
callq fe_sq_n_avx2@plt
#else
callq _fe_sq_n_avx2
#endif /* __APPLE__ */
movq 128(%rsp), %rdi
leaq 32(%rsp), %rsi
movq %rsp, %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
movq 136(%rsp), %rsi
movq 128(%rsp), %rdi
addq $0x90, %rsp
repz retq
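/* Editorial note: fe_invert_avx2 computes z^(p-2) mod p, which by Fermat's
 * little theorem equals z^-1, with p - 2 = 2^255 - 21.  It uses the
 * standard curve25519 addition chain of 254 squarings and 11
 * multiplications over four stack temporaries t0..t3 (offsets 0, 32, 64,
 * 96).  In outline (pseudo-C; fe_sq/fe_mul/fe_sq_n stand for the AVX2
 * routines above):
 *
 *   fe_sq(t0, z);                          // t0 = z^2
 *   fe_sq(t1, t0); fe_sq(t1, t1);          // t1 = z^8
 *   fe_mul(t1, z, t1);                     // t1 = z^9
 *   fe_mul(t0, t0, t1);                    // t0 = z^11
 *   fe_sq(t2, t0);                         // t2 = z^22
 *   fe_mul(t1, t1, t2);                    // t1 = z^31 = z^(2^5 - 1)
 *   fe_sq(t2, t1);  fe_sq_n(t2, t2, 4);    // t2 = z^(2^10 - 2^5)
 *   fe_mul(t1, t2, t1);                    // t1 = z^(2^10 - 1)
 *   // the same square-then-multiply ladder builds z^(2^20 - 1),
 *   // z^(2^40 - 1), z^(2^50 - 1), z^(2^100 - 1), z^(2^200 - 1) and
 *   // z^(2^250 - 1) -- the fe_sq_n counts 9, 19, 9, 49, 99, 49 above
 *   fe_sq(t1, t1);  fe_sq_n(t1, t1, 4);    // t1 = z^(2^255 - 2^5)
 *   fe_mul(r, t1, t0);                     // r = z^(2^255 - 21) = z^(p-2)
 */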
#ifndef __APPLE__
.size fe_invert_avx2,.-fe_invert_avx2
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl curve25519_avx2
.type curve25519_avx2,@function
.align 4
curve25519_avx2:
#else
.section __TEXT,__text
.globl _curve25519_avx2
.p2align 2
_curve25519_avx2:
#endif /* __APPLE__ */
pushq %rbx
pushq %r12
pushq %r13
pushq %r14
pushq %r15
pushq %rbp
movq %rdx, %r8
subq $0xc0, %rsp
movq $0x00, 184(%rsp)
movq %rdi, 176(%rsp)
# Set one
movq $0x01, (%rdi)
movq $0x00, 8(%rdi)
movq $0x00, 16(%rdi)
movq $0x00, 24(%rdi)
# Set zero
movq $0x00, (%rsp)
movq $0x00, 8(%rsp)
movq $0x00, 16(%rsp)
movq $0x00, 24(%rsp)
# Set one
movq $0x01, 32(%rsp)
movq $0x00, 40(%rsp)
movq $0x00, 48(%rsp)
movq $0x00, 56(%rsp)
# Copy
movq (%r8), %r9
movq 8(%r8), %r10
movq 16(%r8), %r11
movq 24(%r8), %r12
movq %r9, 64(%rsp)
movq %r10, 72(%rsp)
movq %r11, 80(%rsp)
movq %r12, 88(%rsp)
movb $62, 168(%rsp)
movq $3, 160(%rsp)
L_curve25519_avx2_words:
L_curve25519_avx2_bits:
movq 184(%rsp), %rbx
movq 160(%rsp), %r9
movb 168(%rsp), %cl
movq (%rsi,%r9,8), %rax
shrq %cl, %rax
andq $0x01, %rax
xorq %rax, %rbx
negq %rbx
# Conditional Swap
movq (%rdi), %r9
movq 8(%rdi), %r10
movq 16(%rdi), %r11
movq 24(%rdi), %r12
xorq 64(%rsp), %r9
xorq 72(%rsp), %r10
xorq 80(%rsp), %r11
xorq 88(%rsp), %r12
andq %rbx, %r9
andq %rbx, %r10
andq %rbx, %r11
andq %rbx, %r12
xorq %r9, (%rdi)
xorq %r10, 8(%rdi)
xorq %r11, 16(%rdi)
xorq %r12, 24(%rdi)
xorq %r9, 64(%rsp)
xorq %r10, 72(%rsp)
xorq %r11, 80(%rsp)
xorq %r12, 88(%rsp)
# Conditional Swap
movq (%rsp), %r9
movq 8(%rsp), %r10
movq 16(%rsp), %r11
movq 24(%rsp), %r12
xorq 32(%rsp), %r9
xorq 40(%rsp), %r10
xorq 48(%rsp), %r11
xorq 56(%rsp), %r12
andq %rbx, %r9
andq %rbx, %r10
andq %rbx, %r11
andq %rbx, %r12
xorq %r9, (%rsp)
xorq %r10, 8(%rsp)
xorq %r11, 16(%rsp)
xorq %r12, 24(%rsp)
xorq %r9, 32(%rsp)
xorq %r10, 40(%rsp)
xorq %r11, 48(%rsp)
xorq %r12, 56(%rsp)
movq %rax, 184(%rsp)
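/* Editorial note: this is the Montgomery-ladder conditional swap.  The
 * current scalar bit is XORed with the previous bit (saved at 184(%rsp)),
 * so the two working points are exchanged only when the bit changes, and
 * the result is negated into an all-zero/all-one mask; the swap itself is
 * the branch-free XOR-mask idiom.  Equivalent C (cswap is the conventional
 * name for this primitive, not a wolfSSL symbol):
 *
 *   #include <stdint.h>
 *
 *   // swap a and b iff bit == 1, with no secret-dependent branch
 *   static void cswap(uint64_t a[4], uint64_t b[4], uint64_t bit)
 *   {
 *       uint64_t mask = 0 - bit;            // 0 or 0xffffffffffffffff
 *       for (int i = 0; i < 4; i++) {
 *           uint64_t t = (a[i] ^ b[i]) & mask;
 *           a[i] ^= t;
 *           b[i] ^= t;
 *       }
 *   }
 */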
|
|
# Add
|
|
movq (%rdi), %r9
|
|
movq 8(%rdi), %r10
|
|
movq 16(%rdi), %r11
|
|
movq 24(%rdi), %rax
|
|
movq %r9, %r13
|
|
addq (%rsp), %r9
|
|
movq %r10, %r14
|
|
adcq 8(%rsp), %r10
|
|
movq %r11, %r15
|
|
adcq 16(%rsp), %r11
|
|
movq %rax, %rbp
|
|
adcq 24(%rsp), %rax
|
|
movq $-19, %rcx
|
|
movq %rax, %r12
|
|
movq $0x7fffffffffffffff, %rbx
|
|
sarq $63, %rax
|
|
# Mask the modulus
|
|
andq %rax, %rcx
|
|
andq %rax, %rbx
|
|
# Sub modulus (if overflow)
|
|
subq %rcx, %r9
|
|
sbbq %rax, %r10
|
|
sbbq %rax, %r11
|
|
sbbq %rbx, %r12
|
|
# Sub
|
|
subq (%rsp), %r13
|
|
movq $0x00, %rax
|
|
sbbq 8(%rsp), %r14
|
|
movq $-19, %rcx
|
|
sbbq 16(%rsp), %r15
|
|
movq $0x7fffffffffffffff, %rbx
|
|
sbbq 24(%rsp), %rbp
|
|
sbbq $0x00, %rax
|
|
# Mask the modulus
|
|
andq %rax, %rcx
|
|
andq %rax, %rbx
|
|
# Add modulus (if underflow)
|
|
addq %rcx, %r13
|
|
adcq %rax, %r14
|
|
adcq %rax, %r15
|
|
adcq %rbx, %rbp
|
|
movq %r9, (%rdi)
|
|
movq %r10, 8(%rdi)
|
|
movq %r11, 16(%rdi)
|
|
movq %r12, 24(%rdi)
|
|
movq %r13, 128(%rsp)
|
|
movq %r14, 136(%rsp)
|
|
movq %r15, 144(%rsp)
|
|
movq %rbp, 152(%rsp)
|
|
# Add
|
|
movq 64(%rsp), %r9
|
|
movq 72(%rsp), %r10
|
|
movq 80(%rsp), %r11
|
|
movq 88(%rsp), %rax
|
|
movq %r9, %r13
|
|
addq 32(%rsp), %r9
|
|
movq %r10, %r14
|
|
adcq 40(%rsp), %r10
|
|
movq %r11, %r15
|
|
adcq 48(%rsp), %r11
|
|
movq %rax, %rbp
|
|
adcq 56(%rsp), %rax
|
|
movq $-19, %rcx
|
|
movq %rax, %r12
|
|
movq $0x7fffffffffffffff, %rbx
|
|
sarq $63, %rax
|
|
# Mask the modulus
|
|
andq %rax, %rcx
|
|
andq %rax, %rbx
|
|
# Sub modulus (if overflow)
|
|
subq %rcx, %r9
|
|
sbbq %rax, %r10
|
|
sbbq %rax, %r11
|
|
sbbq %rbx, %r12
|
|
# Sub
|
|
subq 32(%rsp), %r13
|
|
movq $0x00, %rax
|
|
sbbq 40(%rsp), %r14
|
|
movq $-19, %rcx
|
|
sbbq 48(%rsp), %r15
|
|
movq $0x7fffffffffffffff, %rbx
|
|
sbbq 56(%rsp), %rbp
|
|
sbbq $0x00, %rax
|
|
# Mask the modulus
|
|
andq %rax, %rcx
|
|
andq %rax, %rbx
|
|
# Add modulus (if underflow)
|
|
addq %rcx, %r13
|
|
adcq %rax, %r14
|
|
adcq %rax, %r15
|
|
adcq %rbx, %rbp
|
|
movq %r9, (%rsp)
|
|
movq %r10, 8(%rsp)
|
|
movq %r11, 16(%rsp)
|
|
movq %r12, 24(%rsp)
|
|
movq %r13, 96(%rsp)
|
|
movq %r14, 104(%rsp)
|
|
movq %r15, 112(%rsp)
|
|
movq %rbp, 120(%rsp)
|
|
# Multiply
|
|
# A[0] * B[0]
|
|
movq (%rdi), %rdx
|
|
mulxq 96(%rsp), %r9, %r10
|
|
# A[2] * B[0]
|
|
mulxq 112(%rsp), %r11, %r12
|
|
# A[1] * B[0]
|
|
mulxq 104(%rsp), %rcx, %rbx
|
|
xorq %rbp, %rbp
|
|
adcxq %rcx, %r10
|
|
# A[1] * B[3]
|
|
movq 24(%rdi), %rdx
|
|
mulxq 104(%rsp), %r13, %r14
|
|
adcxq %rbx, %r11
|
|
# A[0] * B[1]
|
|
movq 8(%rdi), %rdx
|
|
mulxq 96(%rsp), %rcx, %rbx
|
|
adoxq %rcx, %r10
|
|
# A[2] * B[1]
|
|
mulxq 112(%rsp), %rcx, %r15
|
|
adoxq %rbx, %r11
|
|
adcxq %rcx, %r12
|
|
# A[1] * B[2]
|
|
movq 16(%rdi), %rdx
|
|
mulxq 104(%rsp), %rcx, %rbx
|
|
adcxq %r15, %r13
|
|
adoxq %rcx, %r12
|
|
adcxq %rbp, %r14
|
|
adoxq %rbx, %r13
|
|
# A[0] * B[2]
|
|
mulxq 96(%rsp), %rcx, %rbx
|
|
adoxq %rbp, %r14
|
|
xorq %r15, %r15
|
|
adcxq %rcx, %r11
|
|
# A[1] * B[1]
|
|
movq 8(%rdi), %rdx
|
|
mulxq 104(%rsp), %rdx, %rcx
|
|
adcxq %rbx, %r12
|
|
adoxq %rdx, %r11
|
|
# A[3] * B[1]
|
|
movq 8(%rdi), %rdx
|
|
adoxq %rcx, %r12
|
|
mulxq 120(%rsp), %rcx, %rbx
|
|
adcxq %rcx, %r13
|
|
# A[2] * B[2]
|
|
movq 16(%rdi), %rdx
|
|
mulxq 112(%rsp), %rdx, %rcx
|
|
adcxq %rbx, %r14
|
|
adoxq %rdx, %r13
|
|
# A[3] * B[3]
|
|
movq 24(%rdi), %rdx
|
|
adoxq %rcx, %r14
|
|
mulxq 120(%rsp), %rcx, %rbx
|
|
adoxq %rbp, %r15
|
|
adcxq %rcx, %r15
|
|
# A[0] * B[3]
|
|
mulxq 96(%rsp), %rdx, %rcx
|
|
adcxq %rbx, %rbp
|
|
xorq %rbx, %rbx
|
|
adcxq %rdx, %r12
|
|
# A[3] * B[0]
|
|
movq (%rdi), %rdx
|
|
adcxq %rcx, %r13
|
|
mulxq 120(%rsp), %rdx, %rcx
|
|
adoxq %rdx, %r12
|
|
adoxq %rcx, %r13
|
|
# A[2] * B[3]
|
|
movq 24(%rdi), %rdx
|
|
mulxq 112(%rsp), %rdx, %rcx
|
|
adcxq %rdx, %r14
|
|
# A[3] * B[2]
|
|
movq 16(%rdi), %rdx
|
|
adcxq %rcx, %r15
|
|
mulxq 120(%rsp), %rcx, %rdx
|
|
adcxq %rbx, %rbp
|
|
adoxq %rcx, %r14
|
|
adoxq %rdx, %r15
|
|
adoxq %rbx, %rbp
|
|
# Reduce
|
|
movq $0x7fffffffffffffff, %rbx
|
|
# Move top half into t4-t7 and remove top bit from t3
|
|
shldq $0x01, %r15, %rbp
|
|
shldq $0x01, %r14, %r15
|
|
shldq $0x01, %r13, %r14
|
|
shldq $0x01, %r12, %r13
|
|
andq %rbx, %r12
|
|
# Multiply top half by 19
|
|
movq $19, %rdx
|
|
xorq %rbx, %rbx
|
|
mulxq %r13, %rcx, %r13
|
|
adcxq %rcx, %r9
|
|
adoxq %r13, %r10
|
|
mulxq %r14, %rcx, %r14
|
|
adcxq %rcx, %r10
|
|
adoxq %r14, %r11
|
|
mulxq %r15, %rcx, %r15
|
|
adcxq %rcx, %r11
|
|
adoxq %r15, %r12
|
|
mulxq %rbp, %rbp, %rdx
|
|
adcxq %rbp, %r12
|
|
adoxq %rbx, %rdx
|
|
adcxq %rbx, %rdx
# Overflow
shldq $0x01, %r12, %rdx
movq $0x7fffffffffffffff, %rbx
imulq $19, %rdx, %rcx
andq %rbx, %r12
addq %rcx, %r9
adcq $0x00, %r10
adcq $0x00, %r11
adcq $0x00, %r12
# Reduce if top bit set
movq %r12, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rcx
andq %rbx, %r12
addq %rcx, %r9
adcq $0x00, %r10
adcq $0x00, %r11
adcq $0x00, %r12
# Store
movq %r9, 32(%rsp)
movq %r10, 40(%rsp)
movq %r11, 48(%rsp)
movq %r12, 56(%rsp)
# Multiply
# A[0] * B[0]
movq 128(%rsp), %rdx
mulxq (%rsp), %r9, %r10
# A[2] * B[0]
mulxq 16(%rsp), %r11, %r12
# A[1] * B[0]
mulxq 8(%rsp), %rcx, %rbx
xorq %rbp, %rbp
adcxq %rcx, %r10
# A[1] * B[3]
movq 152(%rsp), %rdx
mulxq 8(%rsp), %r13, %r14
adcxq %rbx, %r11
# A[0] * B[1]
movq 136(%rsp), %rdx
mulxq (%rsp), %rcx, %rbx
adoxq %rcx, %r10
# A[2] * B[1]
mulxq 16(%rsp), %rcx, %r15
adoxq %rbx, %r11
adcxq %rcx, %r12
# A[1] * B[2]
movq 144(%rsp), %rdx
mulxq 8(%rsp), %rcx, %rbx
adcxq %r15, %r13
adoxq %rcx, %r12
adcxq %rbp, %r14
adoxq %rbx, %r13
# A[0] * B[2]
mulxq (%rsp), %rcx, %rbx
adoxq %rbp, %r14
xorq %r15, %r15
adcxq %rcx, %r11
# A[1] * B[1]
movq 136(%rsp), %rdx
mulxq 8(%rsp), %rdx, %rcx
adcxq %rbx, %r12
adoxq %rdx, %r11
# A[3] * B[1]
movq 136(%rsp), %rdx
adoxq %rcx, %r12
mulxq 24(%rsp), %rcx, %rbx
adcxq %rcx, %r13
# A[2] * B[2]
movq 144(%rsp), %rdx
mulxq 16(%rsp), %rdx, %rcx
adcxq %rbx, %r14
adoxq %rdx, %r13
# A[3] * B[3]
movq 152(%rsp), %rdx
adoxq %rcx, %r14
mulxq 24(%rsp), %rcx, %rbx
adoxq %rbp, %r15
adcxq %rcx, %r15
# A[0] * B[3]
mulxq (%rsp), %rdx, %rcx
adcxq %rbx, %rbp
xorq %rbx, %rbx
adcxq %rdx, %r12
# A[3] * B[0]
movq 128(%rsp), %rdx
adcxq %rcx, %r13
mulxq 24(%rsp), %rdx, %rcx
adoxq %rdx, %r12
adoxq %rcx, %r13
# A[2] * B[3]
movq 152(%rsp), %rdx
mulxq 16(%rsp), %rdx, %rcx
adcxq %rdx, %r14
# A[3] * B[2]
movq 144(%rsp), %rdx
adcxq %rcx, %r15
mulxq 24(%rsp), %rcx, %rdx
adcxq %rbx, %rbp
adoxq %rcx, %r14
adoxq %rdx, %r15
adoxq %rbx, %rbp
# Reduce
movq $0x7fffffffffffffff, %rbx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r15, %rbp
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
andq %rbx, %r12
# Multiply top half by 19
movq $19, %rdx
xorq %rbx, %rbx
mulxq %r13, %rcx, %r13
adcxq %rcx, %r9
adoxq %r13, %r10
mulxq %r14, %rcx, %r14
adcxq %rcx, %r10
adoxq %r14, %r11
mulxq %r15, %rcx, %r15
adcxq %rcx, %r11
adoxq %r15, %r12
mulxq %rbp, %rbp, %rdx
adcxq %rbp, %r12
adoxq %rbx, %rdx
adcxq %rbx, %rdx
# Overflow
shldq $0x01, %r12, %rdx
movq $0x7fffffffffffffff, %rbx
imulq $19, %rdx, %rcx
andq %rbx, %r12
addq %rcx, %r9
adcq $0x00, %r10
adcq $0x00, %r11
adcq $0x00, %r12
# Reduce if top bit set
movq %r12, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rcx
andq %rbx, %r12
addq %rcx, %r9
adcq $0x00, %r10
adcq $0x00, %r11
adcq $0x00, %r12
# Store
movq %r9, (%rsp)
movq %r10, 8(%rsp)
movq %r11, 16(%rsp)
movq %r12, 24(%rsp)
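# The squares below compute each off-diagonal product A[i]*A[j]
# (i != j) only once, double the partial sums, then add in the diagonal
# A[i]*A[i] terms.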
# Square
# A[0] * A[1]
movq 128(%rsp), %rdx
mulxq 136(%rsp), %r10, %r11
# A[0] * A[3]
mulxq 152(%rsp), %r12, %r13
# A[2] * A[1]
movq 144(%rsp), %rdx
mulxq 136(%rsp), %rcx, %rbx
xorq %rbp, %rbp
adoxq %rcx, %r12
# A[2] * A[3]
mulxq 152(%rsp), %r14, %r15
adoxq %rbx, %r13
# A[2] * A[0]
mulxq 128(%rsp), %rcx, %rbx
adoxq %rbp, %r14
adcxq %rcx, %r11
adoxq %rbp, %r15
# A[1] * A[3]
movq 136(%rsp), %rdx
mulxq 152(%rsp), %rax, %r9
adcxq %rbx, %r12
adcxq %rax, %r13
adcxq %r9, %r14
adcxq %rbp, %r15
# Double with Carry Flag
xorq %rbp, %rbp
# A[0] * A[0]
movq 128(%rsp), %rdx
mulxq %rdx, %r9, %rax
adcxq %r10, %r10
# A[1] * A[1]
movq 136(%rsp), %rdx
mulxq %rdx, %rcx, %rbx
adcxq %r11, %r11
adoxq %rax, %r10
adcxq %r12, %r12
adoxq %rcx, %r11
# A[2] * A[2]
movq 144(%rsp), %rdx
mulxq %rdx, %rax, %rcx
adcxq %r13, %r13
adoxq %rbx, %r12
adcxq %r14, %r14
adoxq %rax, %r13
# A[3] * A[3]
movq 152(%rsp), %rdx
mulxq %rdx, %rax, %rbx
adcxq %r15, %r15
adoxq %rcx, %r14
adcxq %rbp, %rbp
adoxq %rax, %r15
adoxq %rbx, %rbp
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r15, %rbp
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
andq %rcx, %r12
# Multiply top half by 19
movq $19, %rdx
xorq %rcx, %rcx
mulxq %r13, %rax, %r13
adcxq %rax, %r9
adoxq %r13, %r10
mulxq %r14, %rax, %r14
adcxq %rax, %r10
adoxq %r14, %r11
mulxq %r15, %rax, %r15
adcxq %rax, %r11
adoxq %r15, %r12
mulxq %rbp, %rbp, %rdx
adcxq %rbp, %r12
adoxq %rcx, %rdx
adcxq %rcx, %rdx
# Overflow
shldq $0x01, %r12, %rdx
movq $0x7fffffffffffffff, %rcx
imulq $19, %rdx, %rax
andq %rcx, %r12
addq %rax, %r9
adcq $0x00, %r10
adcq $0x00, %r11
adcq $0x00, %r12
# Reduce if top bit set
movq %r12, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r12
addq %rax, %r9
adcq $0x00, %r10
adcq $0x00, %r11
adcq $0x00, %r12
# Store
movq %r9, 96(%rsp)
movq %r10, 104(%rsp)
movq %r11, 112(%rsp)
movq %r12, 120(%rsp)
# Square
# A[0] * A[1]
movq (%rdi), %rdx
mulxq 8(%rdi), %r10, %r11
# A[0] * A[3]
mulxq 24(%rdi), %r12, %r13
# A[2] * A[1]
movq 16(%rdi), %rdx
mulxq 8(%rdi), %rcx, %rbx
xorq %rbp, %rbp
adoxq %rcx, %r12
# A[2] * A[3]
mulxq 24(%rdi), %r14, %r15
adoxq %rbx, %r13
# A[2] * A[0]
mulxq (%rdi), %rcx, %rbx
adoxq %rbp, %r14
adcxq %rcx, %r11
adoxq %rbp, %r15
# A[1] * A[3]
movq 8(%rdi), %rdx
mulxq 24(%rdi), %rax, %r9
adcxq %rbx, %r12
adcxq %rax, %r13
adcxq %r9, %r14
adcxq %rbp, %r15
# Double with Carry Flag
xorq %rbp, %rbp
# A[0] * A[0]
movq (%rdi), %rdx
mulxq %rdx, %r9, %rax
adcxq %r10, %r10
# A[1] * A[1]
movq 8(%rdi), %rdx
mulxq %rdx, %rcx, %rbx
adcxq %r11, %r11
adoxq %rax, %r10
adcxq %r12, %r12
adoxq %rcx, %r11
# A[2] * A[2]
movq 16(%rdi), %rdx
mulxq %rdx, %rax, %rcx
adcxq %r13, %r13
adoxq %rbx, %r12
adcxq %r14, %r14
adoxq %rax, %r13
# A[3] * A[3]
movq 24(%rdi), %rdx
mulxq %rdx, %rax, %rbx
adcxq %r15, %r15
adoxq %rcx, %r14
adcxq %rbp, %rbp
adoxq %rax, %r15
adoxq %rbx, %rbp
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r15, %rbp
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
andq %rcx, %r12
# Multiply top half by 19
movq $19, %rdx
xorq %rcx, %rcx
mulxq %r13, %rax, %r13
adcxq %rax, %r9
adoxq %r13, %r10
mulxq %r14, %rax, %r14
adcxq %rax, %r10
adoxq %r14, %r11
mulxq %r15, %rax, %r15
adcxq %rax, %r11
adoxq %r15, %r12
mulxq %rbp, %rbp, %rdx
adcxq %rbp, %r12
adoxq %rcx, %rdx
adcxq %rcx, %rdx
# Overflow
shldq $0x01, %r12, %rdx
movq $0x7fffffffffffffff, %rcx
imulq $19, %rdx, %rax
andq %rcx, %r12
addq %rax, %r9
adcq $0x00, %r10
adcq $0x00, %r11
adcq $0x00, %r12
# Reduce if top bit set
movq %r12, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r12
addq %rax, %r9
adcq $0x00, %r10
adcq $0x00, %r11
adcq $0x00, %r12
# Store
movq %r9, 128(%rsp)
movq %r10, 136(%rsp)
movq %r11, 144(%rsp)
movq %r12, 152(%rsp)
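# The add/sub steps below stay constant time: the carry or borrow is
# widened into an all-ones/all-zero mask (sarq/sbbq) that selects
# whether the modulus is conditionally subtracted or added back, so no
# branch depends on secret data.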
# Add
movq 32(%rsp), %r9
movq 40(%rsp), %r10
movq 48(%rsp), %r11
movq 56(%rsp), %rax
movq %r9, %r13
addq (%rsp), %r9
movq %r10, %r14
adcq 8(%rsp), %r10
movq %r11, %r15
adcq 16(%rsp), %r11
movq %rax, %rbp
adcq 24(%rsp), %rax
movq $-19, %rcx
movq %rax, %r12
movq $0x7fffffffffffffff, %rbx
sarq $63, %rax
# Mask the modulus
andq %rax, %rcx
andq %rax, %rbx
# Sub modulus (if overflow)
subq %rcx, %r9
sbbq %rax, %r10
sbbq %rax, %r11
sbbq %rbx, %r12
# Sub
subq (%rsp), %r13
movq $0x00, %rax
sbbq 8(%rsp), %r14
movq $-19, %rcx
sbbq 16(%rsp), %r15
movq $0x7fffffffffffffff, %rbx
sbbq 24(%rsp), %rbp
sbbq $0x00, %rax
# Mask the modulus
andq %rax, %rcx
andq %rax, %rbx
# Add modulus (if underflow)
addq %rcx, %r13
adcq %rax, %r14
adcq %rax, %r15
adcq %rbx, %rbp
movq %r9, 64(%rsp)
movq %r10, 72(%rsp)
movq %r11, 80(%rsp)
movq %r12, 88(%rsp)
movq %r13, (%rsp)
movq %r14, 8(%rsp)
movq %r15, 16(%rsp)
movq %rbp, 24(%rsp)
# Multiply
# A[0] * B[0]
movq 96(%rsp), %rdx
mulxq 128(%rsp), %r9, %r10
# A[2] * B[0]
mulxq 144(%rsp), %r11, %r12
# A[1] * B[0]
mulxq 136(%rsp), %rcx, %rbx
xorq %rbp, %rbp
adcxq %rcx, %r10
# A[1] * B[3]
movq 120(%rsp), %rdx
mulxq 136(%rsp), %r13, %r14
adcxq %rbx, %r11
# A[0] * B[1]
movq 104(%rsp), %rdx
mulxq 128(%rsp), %rcx, %rbx
adoxq %rcx, %r10
# A[2] * B[1]
mulxq 144(%rsp), %rcx, %r15
adoxq %rbx, %r11
adcxq %rcx, %r12
# A[1] * B[2]
movq 112(%rsp), %rdx
mulxq 136(%rsp), %rcx, %rbx
adcxq %r15, %r13
adoxq %rcx, %r12
adcxq %rbp, %r14
adoxq %rbx, %r13
# A[0] * B[2]
mulxq 128(%rsp), %rcx, %rbx
adoxq %rbp, %r14
xorq %r15, %r15
adcxq %rcx, %r11
# A[1] * B[1]
movq 104(%rsp), %rdx
mulxq 136(%rsp), %rdx, %rcx
adcxq %rbx, %r12
adoxq %rdx, %r11
# A[3] * B[1]
movq 104(%rsp), %rdx
adoxq %rcx, %r12
mulxq 152(%rsp), %rcx, %rbx
adcxq %rcx, %r13
# A[2] * B[2]
movq 112(%rsp), %rdx
mulxq 144(%rsp), %rdx, %rcx
adcxq %rbx, %r14
adoxq %rdx, %r13
# A[3] * B[3]
movq 120(%rsp), %rdx
adoxq %rcx, %r14
mulxq 152(%rsp), %rcx, %rbx
adoxq %rbp, %r15
adcxq %rcx, %r15
# A[0] * B[3]
mulxq 128(%rsp), %rdx, %rcx
adcxq %rbx, %rbp
xorq %rbx, %rbx
adcxq %rdx, %r12
# A[3] * B[0]
movq 96(%rsp), %rdx
adcxq %rcx, %r13
mulxq 152(%rsp), %rdx, %rcx
adoxq %rdx, %r12
adoxq %rcx, %r13
# A[2] * B[3]
movq 120(%rsp), %rdx
mulxq 144(%rsp), %rdx, %rcx
adcxq %rdx, %r14
# A[3] * B[2]
movq 112(%rsp), %rdx
adcxq %rcx, %r15
mulxq 152(%rsp), %rcx, %rdx
adcxq %rbx, %rbp
adoxq %rcx, %r14
adoxq %rdx, %r15
adoxq %rbx, %rbp
# Reduce
movq $0x7fffffffffffffff, %rbx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r15, %rbp
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
andq %rbx, %r12
# Multiply top half by 19
movq $19, %rdx
xorq %rbx, %rbx
mulxq %r13, %rcx, %r13
adcxq %rcx, %r9
adoxq %r13, %r10
mulxq %r14, %rcx, %r14
adcxq %rcx, %r10
adoxq %r14, %r11
mulxq %r15, %rcx, %r15
adcxq %rcx, %r11
adoxq %r15, %r12
mulxq %rbp, %rbp, %rdx
adcxq %rbp, %r12
adoxq %rbx, %rdx
adcxq %rbx, %rdx
# Overflow
shldq $0x01, %r12, %rdx
movq $0x7fffffffffffffff, %rbx
imulq $19, %rdx, %rcx
andq %rbx, %r12
addq %rcx, %r9
adcq $0x00, %r10
adcq $0x00, %r11
adcq $0x00, %r12
# Reduce if top bit set
movq %r12, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rcx
andq %rbx, %r12
addq %rcx, %r9
adcq $0x00, %r10
adcq $0x00, %r11
adcq $0x00, %r12
# Store
movq %r9, (%rdi)
movq %r10, 8(%rdi)
movq %r11, 16(%rdi)
movq %r12, 24(%rdi)
# Sub
movq 128(%rsp), %r9
movq 136(%rsp), %r10
movq 144(%rsp), %r11
movq 152(%rsp), %r12
subq 96(%rsp), %r9
movq $0x00, %rax
sbbq 104(%rsp), %r10
movq $-19, %rcx
sbbq 112(%rsp), %r11
movq $0x7fffffffffffffff, %rbx
sbbq 120(%rsp), %r12
sbbq $0x00, %rax
# Mask the modulus
andq %rax, %rcx
andq %rax, %rbx
# Add modulus (if underflow)
addq %rcx, %r9
adcq %rax, %r10
adcq %rax, %r11
adcq %rbx, %r12
movq %r9, 128(%rsp)
movq %r10, 136(%rsp)
movq %r11, 144(%rsp)
movq %r12, 152(%rsp)
# Square
# A[0] * A[1]
movq (%rsp), %rdx
mulxq 8(%rsp), %r10, %r11
# A[0] * A[3]
mulxq 24(%rsp), %r12, %r13
# A[2] * A[1]
movq 16(%rsp), %rdx
mulxq 8(%rsp), %rcx, %rbx
xorq %rbp, %rbp
adoxq %rcx, %r12
# A[2] * A[3]
mulxq 24(%rsp), %r14, %r15
adoxq %rbx, %r13
# A[2] * A[0]
mulxq (%rsp), %rcx, %rbx
adoxq %rbp, %r14
adcxq %rcx, %r11
adoxq %rbp, %r15
# A[1] * A[3]
movq 8(%rsp), %rdx
mulxq 24(%rsp), %rax, %r9
adcxq %rbx, %r12
adcxq %rax, %r13
adcxq %r9, %r14
adcxq %rbp, %r15
# Double with Carry Flag
xorq %rbp, %rbp
# A[0] * A[0]
movq (%rsp), %rdx
mulxq %rdx, %r9, %rax
adcxq %r10, %r10
# A[1] * A[1]
movq 8(%rsp), %rdx
mulxq %rdx, %rcx, %rbx
adcxq %r11, %r11
adoxq %rax, %r10
adcxq %r12, %r12
adoxq %rcx, %r11
# A[2] * A[2]
movq 16(%rsp), %rdx
mulxq %rdx, %rax, %rcx
adcxq %r13, %r13
adoxq %rbx, %r12
adcxq %r14, %r14
adoxq %rax, %r13
# A[3] * A[3]
movq 24(%rsp), %rdx
mulxq %rdx, %rax, %rbx
adcxq %r15, %r15
adoxq %rcx, %r14
adcxq %rbp, %rbp
adoxq %rax, %r15
adoxq %rbx, %rbp
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r15, %rbp
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
andq %rcx, %r12
# Multiply top half by 19
movq $19, %rdx
xorq %rcx, %rcx
mulxq %r13, %rax, %r13
adcxq %rax, %r9
adoxq %r13, %r10
mulxq %r14, %rax, %r14
adcxq %rax, %r10
adoxq %r14, %r11
mulxq %r15, %rax, %r15
adcxq %rax, %r11
adoxq %r15, %r12
mulxq %rbp, %rbp, %rdx
adcxq %rbp, %r12
adoxq %rcx, %rdx
adcxq %rcx, %rdx
# Overflow
shldq $0x01, %r12, %rdx
movq $0x7fffffffffffffff, %rcx
imulq $19, %rdx, %rax
andq %rcx, %r12
addq %rax, %r9
adcq $0x00, %r10
adcq $0x00, %r11
adcq $0x00, %r12
# Reduce if top bit set
movq %r12, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r12
addq %rax, %r9
adcq $0x00, %r10
adcq $0x00, %r11
adcq $0x00, %r12
# Store
movq %r9, (%rsp)
movq %r10, 8(%rsp)
movq %r11, 16(%rsp)
movq %r12, 24(%rsp)
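# 0x1db42 = 121666 = (486662 + 2) / 4, the curve constant a24 used in
# the x25519 Montgomery ladder step.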
movq $0x1db42, %rdx
mulxq 128(%rsp), %r9, %rbp
mulxq 136(%rsp), %r10, %r15
mulxq 144(%rsp), %r11, %r14
mulxq 152(%rsp), %r12, %r13
addq %rbp, %r10
adcq %r15, %r11
adcq %r14, %r12
adcq $0x00, %r13
movq $0x7fffffffffffffff, %rbp
shldq $0x01, %r12, %r13
andq %rbp, %r12
imulq $19, %r13, %r13
addq %r13, %r9
adcq $0x00, %r10
adcq $0x00, %r11
adcq $0x00, %r12
movq %r9, 32(%rsp)
movq %r10, 40(%rsp)
movq %r11, 48(%rsp)
movq %r12, 56(%rsp)
# Square
# A[0] * A[1]
movq 64(%rsp), %rdx
mulxq 72(%rsp), %r10, %r11
# A[0] * A[3]
mulxq 88(%rsp), %r12, %r13
# A[2] * A[1]
movq 80(%rsp), %rdx
mulxq 72(%rsp), %rcx, %rbx
xorq %rbp, %rbp
adoxq %rcx, %r12
# A[2] * A[3]
mulxq 88(%rsp), %r14, %r15
adoxq %rbx, %r13
# A[2] * A[0]
mulxq 64(%rsp), %rcx, %rbx
adoxq %rbp, %r14
adcxq %rcx, %r11
adoxq %rbp, %r15
# A[1] * A[3]
movq 72(%rsp), %rdx
mulxq 88(%rsp), %rax, %r9
adcxq %rbx, %r12
adcxq %rax, %r13
adcxq %r9, %r14
adcxq %rbp, %r15
# Double with Carry Flag
xorq %rbp, %rbp
# A[0] * A[0]
movq 64(%rsp), %rdx
mulxq %rdx, %r9, %rax
adcxq %r10, %r10
# A[1] * A[1]
movq 72(%rsp), %rdx
mulxq %rdx, %rcx, %rbx
adcxq %r11, %r11
adoxq %rax, %r10
adcxq %r12, %r12
adoxq %rcx, %r11
# A[2] * A[2]
movq 80(%rsp), %rdx
mulxq %rdx, %rax, %rcx
adcxq %r13, %r13
adoxq %rbx, %r12
adcxq %r14, %r14
adoxq %rax, %r13
# A[3] * A[3]
movq 88(%rsp), %rdx
mulxq %rdx, %rax, %rbx
adcxq %r15, %r15
adoxq %rcx, %r14
adcxq %rbp, %rbp
adoxq %rax, %r15
adoxq %rbx, %rbp
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r15, %rbp
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
andq %rcx, %r12
# Multiply top half by 19
movq $19, %rdx
xorq %rcx, %rcx
mulxq %r13, %rax, %r13
adcxq %rax, %r9
adoxq %r13, %r10
mulxq %r14, %rax, %r14
adcxq %rax, %r10
adoxq %r14, %r11
mulxq %r15, %rax, %r15
adcxq %rax, %r11
adoxq %r15, %r12
mulxq %rbp, %rbp, %rdx
adcxq %rbp, %r12
adoxq %rcx, %rdx
adcxq %rcx, %rdx
# Overflow
shldq $0x01, %r12, %rdx
movq $0x7fffffffffffffff, %rcx
imulq $19, %rdx, %rax
andq %rcx, %r12
addq %rax, %r9
adcq $0x00, %r10
adcq $0x00, %r11
adcq $0x00, %r12
# Reduce if top bit set
movq %r12, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rax
andq %rcx, %r12
addq %rax, %r9
adcq $0x00, %r10
adcq $0x00, %r11
adcq $0x00, %r12
# Store
movq %r9, 64(%rsp)
movq %r10, 72(%rsp)
movq %r11, 80(%rsp)
movq %r12, 88(%rsp)
# Add
movq 96(%rsp), %r9
movq 104(%rsp), %r10
addq 32(%rsp), %r9
movq 112(%rsp), %r11
adcq 40(%rsp), %r10
movq 120(%rsp), %rax
adcq 48(%rsp), %r11
movq $-19, %rcx
adcq 56(%rsp), %rax
movq $0x7fffffffffffffff, %rbx
movq %rax, %r12
sarq $63, %rax
# Mask the modulus
andq %rax, %rcx
andq %rax, %rbx
# Sub modulus (if overflow)
subq %rcx, %r9
sbbq %rax, %r10
sbbq %rax, %r11
sbbq %rbx, %r12
movq %r9, 96(%rsp)
movq %r10, 104(%rsp)
movq %r11, 112(%rsp)
movq %r12, 120(%rsp)
# Multiply
# A[0] * B[0]
movq (%rsp), %rdx
mulxq (%r8), %r9, %r10
# A[2] * B[0]
mulxq 16(%r8), %r11, %r12
# A[1] * B[0]
mulxq 8(%r8), %rcx, %rbx
xorq %rbp, %rbp
adcxq %rcx, %r10
# A[1] * B[3]
movq 24(%rsp), %rdx
mulxq 8(%r8), %r13, %r14
adcxq %rbx, %r11
# A[0] * B[1]
movq 8(%rsp), %rdx
mulxq (%r8), %rcx, %rbx
adoxq %rcx, %r10
# A[2] * B[1]
mulxq 16(%r8), %rcx, %r15
adoxq %rbx, %r11
adcxq %rcx, %r12
# A[1] * B[2]
movq 16(%rsp), %rdx
mulxq 8(%r8), %rcx, %rbx
adcxq %r15, %r13
adoxq %rcx, %r12
adcxq %rbp, %r14
adoxq %rbx, %r13
# A[0] * B[2]
mulxq (%r8), %rcx, %rbx
adoxq %rbp, %r14
xorq %r15, %r15
adcxq %rcx, %r11
# A[1] * B[1]
movq 8(%rsp), %rdx
mulxq 8(%r8), %rdx, %rcx
adcxq %rbx, %r12
adoxq %rdx, %r11
# A[3] * B[1]
movq 8(%rsp), %rdx
adoxq %rcx, %r12
mulxq 24(%r8), %rcx, %rbx
adcxq %rcx, %r13
# A[2] * B[2]
movq 16(%rsp), %rdx
mulxq 16(%r8), %rdx, %rcx
adcxq %rbx, %r14
adoxq %rdx, %r13
# A[3] * B[3]
movq 24(%rsp), %rdx
adoxq %rcx, %r14
mulxq 24(%r8), %rcx, %rbx
adoxq %rbp, %r15
adcxq %rcx, %r15
# A[0] * B[3]
mulxq (%r8), %rdx, %rcx
adcxq %rbx, %rbp
xorq %rbx, %rbx
adcxq %rdx, %r12
# A[3] * B[0]
movq (%rsp), %rdx
adcxq %rcx, %r13
mulxq 24(%r8), %rdx, %rcx
adoxq %rdx, %r12
adoxq %rcx, %r13
# A[2] * B[3]
movq 24(%rsp), %rdx
mulxq 16(%r8), %rdx, %rcx
adcxq %rdx, %r14
# A[3] * B[2]
movq 16(%rsp), %rdx
adcxq %rcx, %r15
mulxq 24(%r8), %rcx, %rdx
adcxq %rbx, %rbp
adoxq %rcx, %r14
adoxq %rdx, %r15
adoxq %rbx, %rbp
# Reduce
movq $0x7fffffffffffffff, %rbx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r15, %rbp
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
andq %rbx, %r12
# Multiply top half by 19
movq $19, %rdx
xorq %rbx, %rbx
mulxq %r13, %rcx, %r13
adcxq %rcx, %r9
adoxq %r13, %r10
mulxq %r14, %rcx, %r14
adcxq %rcx, %r10
adoxq %r14, %r11
mulxq %r15, %rcx, %r15
adcxq %rcx, %r11
adoxq %r15, %r12
mulxq %rbp, %rbp, %rdx
adcxq %rbp, %r12
adoxq %rbx, %rdx
adcxq %rbx, %rdx
# Overflow
shldq $0x01, %r12, %rdx
movq $0x7fffffffffffffff, %rbx
imulq $19, %rdx, %rcx
andq %rbx, %r12
addq %rcx, %r9
adcq $0x00, %r10
adcq $0x00, %r11
adcq $0x00, %r12
# Reduce if top bit set
movq %r12, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rcx
andq %rbx, %r12
addq %rcx, %r9
adcq $0x00, %r10
adcq $0x00, %r11
adcq $0x00, %r12
# Store
movq %r9, 32(%rsp)
movq %r10, 40(%rsp)
movq %r11, 48(%rsp)
movq %r12, 56(%rsp)
# Multiply
# A[0] * B[0]
movq 96(%rsp), %rdx
mulxq 128(%rsp), %r9, %r10
# A[2] * B[0]
mulxq 144(%rsp), %r11, %r12
# A[1] * B[0]
mulxq 136(%rsp), %rcx, %rbx
xorq %rbp, %rbp
adcxq %rcx, %r10
# A[1] * B[3]
movq 120(%rsp), %rdx
mulxq 136(%rsp), %r13, %r14
adcxq %rbx, %r11
# A[0] * B[1]
movq 104(%rsp), %rdx
mulxq 128(%rsp), %rcx, %rbx
adoxq %rcx, %r10
# A[2] * B[1]
mulxq 144(%rsp), %rcx, %r15
adoxq %rbx, %r11
adcxq %rcx, %r12
# A[1] * B[2]
movq 112(%rsp), %rdx
mulxq 136(%rsp), %rcx, %rbx
adcxq %r15, %r13
adoxq %rcx, %r12
adcxq %rbp, %r14
adoxq %rbx, %r13
# A[0] * B[2]
mulxq 128(%rsp), %rcx, %rbx
adoxq %rbp, %r14
xorq %r15, %r15
adcxq %rcx, %r11
# A[1] * B[1]
movq 104(%rsp), %rdx
mulxq 136(%rsp), %rdx, %rcx
adcxq %rbx, %r12
adoxq %rdx, %r11
# A[3] * B[1]
movq 104(%rsp), %rdx
adoxq %rcx, %r12
mulxq 152(%rsp), %rcx, %rbx
adcxq %rcx, %r13
# A[2] * B[2]
movq 112(%rsp), %rdx
mulxq 144(%rsp), %rdx, %rcx
adcxq %rbx, %r14
adoxq %rdx, %r13
# A[3] * B[3]
movq 120(%rsp), %rdx
adoxq %rcx, %r14
mulxq 152(%rsp), %rcx, %rbx
adoxq %rbp, %r15
adcxq %rcx, %r15
# A[0] * B[3]
mulxq 128(%rsp), %rdx, %rcx
adcxq %rbx, %rbp
xorq %rbx, %rbx
adcxq %rdx, %r12
# A[3] * B[0]
movq 96(%rsp), %rdx
adcxq %rcx, %r13
mulxq 152(%rsp), %rdx, %rcx
adoxq %rdx, %r12
adoxq %rcx, %r13
# A[2] * B[3]
movq 120(%rsp), %rdx
mulxq 144(%rsp), %rdx, %rcx
adcxq %rdx, %r14
# A[3] * B[2]
movq 112(%rsp), %rdx
adcxq %rcx, %r15
mulxq 152(%rsp), %rcx, %rdx
adcxq %rbx, %rbp
adoxq %rcx, %r14
adoxq %rdx, %r15
adoxq %rbx, %rbp
# Reduce
movq $0x7fffffffffffffff, %rbx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r15, %rbp
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
andq %rbx, %r12
# Multiply top half by 19
movq $19, %rdx
xorq %rbx, %rbx
mulxq %r13, %rcx, %r13
adcxq %rcx, %r9
adoxq %r13, %r10
mulxq %r14, %rcx, %r14
adcxq %rcx, %r10
adoxq %r14, %r11
mulxq %r15, %rcx, %r15
adcxq %rcx, %r11
adoxq %r15, %r12
mulxq %rbp, %rbp, %rdx
adcxq %rbp, %r12
adoxq %rbx, %rdx
adcxq %rbx, %rdx
# Overflow
shldq $0x01, %r12, %rdx
movq $0x7fffffffffffffff, %rbx
imulq $19, %rdx, %rcx
andq %rbx, %r12
addq %rcx, %r9
adcq $0x00, %r10
adcq $0x00, %r11
adcq $0x00, %r12
# Reduce if top bit set
movq %r12, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rcx
andq %rbx, %r12
addq %rcx, %r9
adcq $0x00, %r10
adcq $0x00, %r11
adcq $0x00, %r12
# Store
movq %r9, (%rsp)
movq %r10, 8(%rsp)
movq %r11, 16(%rsp)
movq %r12, 24(%rsp)
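# Ladder loop control: 168(%rsp) appears to hold the bit index within
# the current 64-bit scalar word and 160(%rsp) the remaining word
# count; both count down.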
decb 168(%rsp)
jge L_curve25519_avx2_bits
movq $63, 168(%rsp)
decb 160(%rsp)
jge L_curve25519_avx2_words
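# The inversion computes z^(p-2) mod p (Fermat's little theorem), i.e.
# the exponent 2^255 - 21, using the standard curve25519 addition chain
# of 254 squarings and 11 multiplies built from fe_sq_avx2,
# fe_sq_n_avx2 and fe_mul_avx2.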
# Invert
leaq 32(%rsp), %rdi
movq %rsp, %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 32(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 64(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
movq %rsp, %rsi
leaq 64(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 32(%rsp), %rsi
leaq 64(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 32(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 64(%rsp), %rsi
leaq 96(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 64(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 96(%rsp), %rsi
movq $4, %rdx
#ifndef __APPLE__
callq fe_sq_n_avx2@plt
#else
callq _fe_sq_n_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 96(%rsp), %rsi
leaq 64(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 64(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 96(%rsp), %rsi
movq $9, %rdx
#ifndef __APPLE__
callq fe_sq_n_avx2@plt
#else
callq _fe_sq_n_avx2
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 96(%rsp), %rsi
leaq 64(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 128(%rsp), %rdi
leaq 96(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 128(%rsp), %rdi
leaq 128(%rsp), %rsi
movq $19, %rdx
#ifndef __APPLE__
callq fe_sq_n_avx2@plt
#else
callq _fe_sq_n_avx2
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 128(%rsp), %rsi
leaq 96(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 96(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 96(%rsp), %rsi
movq $9, %rdx
#ifndef __APPLE__
callq fe_sq_n_avx2@plt
#else
callq _fe_sq_n_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 96(%rsp), %rsi
leaq 64(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 64(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 96(%rsp), %rsi
movq $49, %rdx
#ifndef __APPLE__
callq fe_sq_n_avx2@plt
#else
callq _fe_sq_n_avx2
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 96(%rsp), %rsi
leaq 64(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 128(%rsp), %rdi
leaq 96(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 128(%rsp), %rdi
leaq 128(%rsp), %rsi
movq $0x63, %rdx
#ifndef __APPLE__
callq fe_sq_n_avx2@plt
#else
callq _fe_sq_n_avx2
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 128(%rsp), %rsi
leaq 96(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 96(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 96(%rsp), %rdi
leaq 96(%rsp), %rsi
movq $49, %rdx
#ifndef __APPLE__
callq fe_sq_n_avx2@plt
#else
callq _fe_sq_n_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 96(%rsp), %rsi
leaq 64(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 64(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 64(%rsp), %rsi
movq $4, %rdx
#ifndef __APPLE__
callq fe_sq_n_avx2@plt
#else
callq _fe_sq_n_avx2
#endif /* __APPLE__ */
movq %rsp, %rdi
leaq 64(%rsp), %rsi
leaq 32(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
movq 176(%rsp), %rdi
# Multiply
# A[0] * B[0]
movq (%rsp), %rdx
mulxq (%rdi), %r9, %r10
# A[2] * B[0]
mulxq 16(%rdi), %r11, %r12
# A[1] * B[0]
mulxq 8(%rdi), %rcx, %rbx
xorq %rbp, %rbp
adcxq %rcx, %r10
# A[1] * B[3]
movq 24(%rsp), %rdx
mulxq 8(%rdi), %r13, %r14
adcxq %rbx, %r11
# A[0] * B[1]
movq 8(%rsp), %rdx
mulxq (%rdi), %rcx, %rbx
adoxq %rcx, %r10
# A[2] * B[1]
mulxq 16(%rdi), %rcx, %r15
adoxq %rbx, %r11
adcxq %rcx, %r12
# A[1] * B[2]
movq 16(%rsp), %rdx
mulxq 8(%rdi), %rcx, %rbx
adcxq %r15, %r13
adoxq %rcx, %r12
adcxq %rbp, %r14
adoxq %rbx, %r13
# A[0] * B[2]
mulxq (%rdi), %rcx, %rbx
adoxq %rbp, %r14
xorq %r15, %r15
adcxq %rcx, %r11
# A[1] * B[1]
movq 8(%rsp), %rdx
mulxq 8(%rdi), %rdx, %rcx
adcxq %rbx, %r12
adoxq %rdx, %r11
# A[3] * B[1]
movq 8(%rsp), %rdx
adoxq %rcx, %r12
mulxq 24(%rdi), %rcx, %rbx
adcxq %rcx, %r13
# A[2] * B[2]
movq 16(%rsp), %rdx
mulxq 16(%rdi), %rdx, %rcx
adcxq %rbx, %r14
adoxq %rdx, %r13
# A[3] * B[3]
movq 24(%rsp), %rdx
adoxq %rcx, %r14
mulxq 24(%rdi), %rcx, %rbx
adoxq %rbp, %r15
adcxq %rcx, %r15
# A[0] * B[3]
mulxq (%rdi), %rdx, %rcx
adcxq %rbx, %rbp
xorq %rbx, %rbx
adcxq %rdx, %r12
# A[3] * B[0]
movq (%rsp), %rdx
adcxq %rcx, %r13
mulxq 24(%rdi), %rdx, %rcx
adoxq %rdx, %r12
adoxq %rcx, %r13
# A[2] * B[3]
movq 24(%rsp), %rdx
mulxq 16(%rdi), %rdx, %rcx
adcxq %rdx, %r14
# A[3] * B[2]
movq 16(%rsp), %rdx
adcxq %rcx, %r15
mulxq 24(%rdi), %rcx, %rdx
adcxq %rbx, %rbp
adoxq %rcx, %r14
adoxq %rdx, %r15
adoxq %rbx, %rbp
# Reduce
movq $0x7fffffffffffffff, %rbx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r15, %rbp
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
andq %rbx, %r12
# Multiply top half by 19
movq $19, %rdx
xorq %rbx, %rbx
mulxq %r13, %rcx, %r13
adcxq %rcx, %r9
adoxq %r13, %r10
mulxq %r14, %rcx, %r14
adcxq %rcx, %r10
adoxq %r14, %r11
mulxq %r15, %rcx, %r15
adcxq %rcx, %r11
adoxq %r15, %r12
mulxq %rbp, %rbp, %rdx
adcxq %rbp, %r12
adoxq %rbx, %rdx
adcxq %rbx, %rdx
# Overflow
shldq $0x01, %r12, %rdx
movq $0x7fffffffffffffff, %rbx
imulq $19, %rdx, %rcx
andq %rbx, %r12
addq %rcx, %r9
adcq $0x00, %r10
adcq $0x00, %r11
adcq $0x00, %r12
# Reduce if top bit set
movq %r12, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rcx
andq %rbx, %r12
addq %rcx, %r9
adcq $0x00, %r10
adcq $0x00, %r11
adcq $0x00, %r12
# Store
movq %r9, (%rdi)
movq %r10, 8(%rdi)
movq %r11, 16(%rdi)
movq %r12, 24(%rdi)
xorq %rax, %rax
addq $0xc0, %rsp
popq %rbp
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbx
repz retq
#ifndef __APPLE__
.size curve25519_avx2,.-curve25519_avx2
#endif /* __APPLE__ */
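# fe_pow22523_avx2 computes z^((p-5)/8) = z^(2^252 - 3), the power used
# when computing square roots for ed25519 point decompression.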
#ifndef __APPLE__
.text
.globl fe_pow22523_avx2
.type fe_pow22523_avx2,@function
.align 4
fe_pow22523_avx2:
#else
.section __TEXT,__text
.globl _fe_pow22523_avx2
.p2align 2
_fe_pow22523_avx2:
#endif /* __APPLE__ */
subq $0x70, %rsp
# pow22523
movq %rdi, 96(%rsp)
movq %rsi, 104(%rsp)
movq %rsp, %rdi
movq 104(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
movq %rsp, %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 32(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
movq 104(%rsp), %rsi
leaq 32(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
movq %rsp, %rdi
movq %rsp, %rsi
leaq 32(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
movq %rsp, %rdi
movq %rsp, %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
movq %rsp, %rdi
leaq 32(%rsp), %rsi
movq %rsp, %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
movq %rsp, %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 32(%rsp), %rsi
movb $4, %dl
#ifndef __APPLE__
callq fe_sq_n_avx2@plt
#else
callq _fe_sq_n_avx2
#endif /* __APPLE__ */
movq %rsp, %rdi
leaq 32(%rsp), %rsi
movq %rsp, %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
movq %rsp, %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 32(%rsp), %rsi
movb $9, %dl
#ifndef __APPLE__
callq fe_sq_n_avx2@plt
#else
callq _fe_sq_n_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 32(%rsp), %rsi
movq %rsp, %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 32(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 64(%rsp), %rsi
movb $19, %dl
#ifndef __APPLE__
callq fe_sq_n_avx2@plt
#else
callq _fe_sq_n_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 64(%rsp), %rsi
leaq 32(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 32(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 32(%rsp), %rsi
movb $9, %dl
#ifndef __APPLE__
callq fe_sq_n_avx2@plt
#else
callq _fe_sq_n_avx2
#endif /* __APPLE__ */
movq %rsp, %rdi
leaq 32(%rsp), %rsi
movq %rsp, %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
movq %rsp, %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 32(%rsp), %rsi
movb $49, %dl
#ifndef __APPLE__
callq fe_sq_n_avx2@plt
#else
callq _fe_sq_n_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 32(%rsp), %rsi
movq %rsp, %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 32(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 64(%rsp), %rdi
leaq 64(%rsp), %rsi
movb $0x63, %dl
#ifndef __APPLE__
callq fe_sq_n_avx2@plt
#else
callq _fe_sq_n_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 64(%rsp), %rsi
leaq 32(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 32(%rsp), %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
leaq 32(%rsp), %rdi
leaq 32(%rsp), %rsi
movb $49, %dl
#ifndef __APPLE__
callq fe_sq_n_avx2@plt
#else
callq _fe_sq_n_avx2
#endif /* __APPLE__ */
movq %rsp, %rdi
leaq 32(%rsp), %rsi
movq %rsp, %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
movq %rsp, %rdi
movq %rsp, %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
movq %rsp, %rdi
movq %rsp, %rsi
#ifndef __APPLE__
callq fe_sq_avx2@plt
#else
callq _fe_sq_avx2
#endif /* __APPLE__ */
movq 96(%rsp), %rdi
movq %rsp, %rsi
movq 104(%rsp), %rdx
#ifndef __APPLE__
callq fe_mul_avx2@plt
#else
callq _fe_mul_avx2
#endif /* __APPLE__ */
movq 104(%rsp), %rsi
movq 96(%rsp), %rdi
addq $0x70, %rsp
repz retq
#ifndef __APPLE__
.size fe_pow22523_avx2,.-fe_pow22523_avx2
#endif /* __APPLE__ */
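# fe_ge_to_p2_avx2 converts a point from the completed (p1p1)
# representation to projective (p2) coordinates using three field
# multiplies: rx = px*pt, ry = py*pz, rz = pz*pt.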
#ifndef __APPLE__
.text
.globl fe_ge_to_p2_avx2
.type fe_ge_to_p2_avx2,@function
.align 4
fe_ge_to_p2_avx2:
#else
.section __TEXT,__text
.globl _fe_ge_to_p2_avx2
.p2align 2
_fe_ge_to_p2_avx2:
#endif /* __APPLE__ */
pushq %rbx
pushq %r12
pushq %r13
pushq %r14
pushq %r15
subq $40, %rsp
movq %rsi, (%rsp)
movq %rdx, 8(%rsp)
movq %rcx, 16(%rsp)
movq %r8, 24(%rsp)
movq %r9, 32(%rsp)
movq 16(%rsp), %rsi
movq 88(%rsp), %rbx
# Multiply
# A[0] * B[0]
movq (%rbx), %rdx
mulxq (%rsi), %r8, %r9
# A[2] * B[0]
mulxq 16(%rsi), %r10, %r11
# A[1] * B[0]
mulxq 8(%rsi), %rcx, %rax
xorq %r15, %r15
adcxq %rcx, %r9
# A[1] * B[3]
movq 24(%rbx), %rdx
mulxq 8(%rsi), %r12, %r13
adcxq %rax, %r10
# A[0] * B[1]
movq 8(%rbx), %rdx
mulxq (%rsi), %rcx, %rax
adoxq %rcx, %r9
# A[2] * B[1]
mulxq 16(%rsi), %rcx, %r14
adoxq %rax, %r10
adcxq %rcx, %r11
# A[1] * B[2]
movq 16(%rbx), %rdx
mulxq 8(%rsi), %rcx, %rax
adcxq %r14, %r12
adoxq %rcx, %r11
adcxq %r15, %r13
adoxq %rax, %r12
# A[0] * B[2]
mulxq (%rsi), %rcx, %rax
adoxq %r15, %r13
xorq %r14, %r14
adcxq %rcx, %r10
# A[1] * B[1]
movq 8(%rbx), %rdx
mulxq 8(%rsi), %rdx, %rcx
adcxq %rax, %r11
adoxq %rdx, %r10
# A[3] * B[1]
movq 8(%rbx), %rdx
adoxq %rcx, %r11
mulxq 24(%rsi), %rcx, %rax
adcxq %rcx, %r12
# A[2] * B[2]
movq 16(%rbx), %rdx
mulxq 16(%rsi), %rdx, %rcx
adcxq %rax, %r13
adoxq %rdx, %r12
# A[3] * B[3]
movq 24(%rbx), %rdx
adoxq %rcx, %r13
mulxq 24(%rsi), %rcx, %rax
adoxq %r15, %r14
adcxq %rcx, %r14
# A[0] * B[3]
mulxq (%rsi), %rdx, %rcx
adcxq %rax, %r15
xorq %rax, %rax
adcxq %rdx, %r11
# A[3] * B[0]
movq (%rbx), %rdx
adcxq %rcx, %r12
mulxq 24(%rsi), %rdx, %rcx
adoxq %rdx, %r11
adoxq %rcx, %r12
# A[2] * B[3]
movq 24(%rbx), %rdx
mulxq 16(%rsi), %rdx, %rcx
adcxq %rdx, %r13
# A[3] * B[2]
movq 16(%rbx), %rdx
adcxq %rcx, %r14
mulxq 24(%rsi), %rcx, %rdx
adcxq %rax, %r15
adoxq %rcx, %r13
adoxq %rdx, %r14
adoxq %rax, %r15
# Reduce
movq $0x7fffffffffffffff, %rax
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rax, %r11
# Multiply top half by 19
movq $19, %rdx
xorq %rax, %rax
mulxq %r12, %rcx, %r12
adcxq %rcx, %r8
adoxq %r12, %r9
mulxq %r13, %rcx, %r13
adcxq %rcx, %r9
adoxq %r13, %r10
mulxq %r14, %rcx, %r14
adcxq %rcx, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rax, %rdx
adcxq %rax, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rax
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq (%rsp), %rdi
movq 24(%rsp), %rsi
movq 32(%rsp), %rbx
# Multiply
# A[0] * B[0]
movq (%rbx), %rdx
mulxq (%rsi), %r8, %r9
# A[2] * B[0]
mulxq 16(%rsi), %r10, %r11
# A[1] * B[0]
mulxq 8(%rsi), %rcx, %rax
xorq %r15, %r15
adcxq %rcx, %r9
# A[1] * B[3]
movq 24(%rbx), %rdx
mulxq 8(%rsi), %r12, %r13
adcxq %rax, %r10
# A[0] * B[1]
movq 8(%rbx), %rdx
mulxq (%rsi), %rcx, %rax
adoxq %rcx, %r9
# A[2] * B[1]
mulxq 16(%rsi), %rcx, %r14
adoxq %rax, %r10
adcxq %rcx, %r11
# A[1] * B[2]
movq 16(%rbx), %rdx
mulxq 8(%rsi), %rcx, %rax
adcxq %r14, %r12
adoxq %rcx, %r11
adcxq %r15, %r13
adoxq %rax, %r12
# A[0] * B[2]
mulxq (%rsi), %rcx, %rax
adoxq %r15, %r13
xorq %r14, %r14
adcxq %rcx, %r10
# A[1] * B[1]
movq 8(%rbx), %rdx
mulxq 8(%rsi), %rdx, %rcx
adcxq %rax, %r11
adoxq %rdx, %r10
# A[3] * B[1]
movq 8(%rbx), %rdx
adoxq %rcx, %r11
mulxq 24(%rsi), %rcx, %rax
adcxq %rcx, %r12
# A[2] * B[2]
movq 16(%rbx), %rdx
mulxq 16(%rsi), %rdx, %rcx
adcxq %rax, %r13
adoxq %rdx, %r12
# A[3] * B[3]
movq 24(%rbx), %rdx
adoxq %rcx, %r13
mulxq 24(%rsi), %rcx, %rax
adoxq %r15, %r14
adcxq %rcx, %r14
# A[0] * B[3]
mulxq (%rsi), %rdx, %rcx
adcxq %rax, %r15
xorq %rax, %rax
adcxq %rdx, %r11
# A[3] * B[0]
movq (%rbx), %rdx
adcxq %rcx, %r12
mulxq 24(%rsi), %rdx, %rcx
adoxq %rdx, %r11
adoxq %rcx, %r12
# A[2] * B[3]
movq 24(%rbx), %rdx
mulxq 16(%rsi), %rdx, %rcx
adcxq %rdx, %r13
# A[3] * B[2]
movq 16(%rbx), %rdx
adcxq %rcx, %r14
mulxq 24(%rsi), %rcx, %rdx
adcxq %rax, %r15
adoxq %rcx, %r13
adoxq %rdx, %r14
adoxq %rax, %r15
# Reduce
movq $0x7fffffffffffffff, %rax
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rax, %r11
# Multiply top half by 19
movq $19, %rdx
xorq %rax, %rax
mulxq %r12, %rcx, %r12
adcxq %rcx, %r8
adoxq %r12, %r9
mulxq %r13, %rcx, %r13
adcxq %rcx, %r9
adoxq %r13, %r10
mulxq %r14, %rcx, %r14
adcxq %rcx, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rax, %rdx
adcxq %rax, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rax
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 8(%rsp), %rdi
movq 88(%rsp), %rsi
# Multiply
# A[0] * B[0]
movq (%rsi), %rdx
mulxq (%rbx), %r8, %r9
# A[2] * B[0]
mulxq 16(%rbx), %r10, %r11
# A[1] * B[0]
mulxq 8(%rbx), %rcx, %rax
xorq %r15, %r15
adcxq %rcx, %r9
# A[1] * B[3]
movq 24(%rsi), %rdx
mulxq 8(%rbx), %r12, %r13
adcxq %rax, %r10
# A[0] * B[1]
movq 8(%rsi), %rdx
mulxq (%rbx), %rcx, %rax
adoxq %rcx, %r9
# A[2] * B[1]
mulxq 16(%rbx), %rcx, %r14
adoxq %rax, %r10
adcxq %rcx, %r11
# A[1] * B[2]
movq 16(%rsi), %rdx
mulxq 8(%rbx), %rcx, %rax
adcxq %r14, %r12
adoxq %rcx, %r11
adcxq %r15, %r13
adoxq %rax, %r12
# A[0] * B[2]
mulxq (%rbx), %rcx, %rax
adoxq %r15, %r13
xorq %r14, %r14
adcxq %rcx, %r10
# A[1] * B[1]
movq 8(%rsi), %rdx
mulxq 8(%rbx), %rdx, %rcx
adcxq %rax, %r11
adoxq %rdx, %r10
# A[3] * B[1]
movq 8(%rsi), %rdx
adoxq %rcx, %r11
mulxq 24(%rbx), %rcx, %rax
adcxq %rcx, %r12
# A[2] * B[2]
movq 16(%rsi), %rdx
mulxq 16(%rbx), %rdx, %rcx
adcxq %rax, %r13
adoxq %rdx, %r12
# A[3] * B[3]
movq 24(%rsi), %rdx
adoxq %rcx, %r13
mulxq 24(%rbx), %rcx, %rax
adoxq %r15, %r14
adcxq %rcx, %r14
# A[0] * B[3]
mulxq (%rbx), %rdx, %rcx
adcxq %rax, %r15
xorq %rax, %rax
adcxq %rdx, %r11
# A[3] * B[0]
movq (%rsi), %rdx
adcxq %rcx, %r12
mulxq 24(%rbx), %rdx, %rcx
adoxq %rdx, %r11
adoxq %rcx, %r12
# A[2] * B[3]
movq 24(%rsi), %rdx
mulxq 16(%rbx), %rdx, %rcx
adcxq %rdx, %r13
# A[3] * B[2]
movq 16(%rsi), %rdx
adcxq %rcx, %r14
mulxq 24(%rbx), %rcx, %rdx
adcxq %rax, %r15
adoxq %rcx, %r13
adoxq %rdx, %r14
adoxq %rax, %r15
# Reduce
movq $0x7fffffffffffffff, %rax
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rax, %r11
# Multiply top half by 19
movq $19, %rdx
xorq %rax, %rax
mulxq %r12, %rcx, %r12
adcxq %rcx, %r8
adoxq %r12, %r9
mulxq %r13, %rcx, %r13
adcxq %rcx, %r9
adoxq %r13, %r10
mulxq %r14, %rcx, %r14
adcxq %rcx, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rax, %rdx
adcxq %rax, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rax
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
addq $40, %rsp
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbx
repz retq
#ifndef __APPLE__
.size fe_ge_to_p2_avx2,.-fe_ge_to_p2_avx2
#endif /* __APPLE__ */
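# fe_ge_to_p3_avx2 performs the same conversion into extended (p3)
# coordinates, also producing the T coordinate:
# rx = px*pt, ry = py*pz, rz = pz*pt, rt = px*py.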
|
|
#ifndef __APPLE__
|
|
.text
|
|
.globl fe_ge_to_p3_avx2
|
|
.type fe_ge_to_p3_avx2,@function
|
|
.align 4
|
|
fe_ge_to_p3_avx2:
|
|
#else
|
|
.section __TEXT,__text
|
|
.globl _fe_ge_to_p3_avx2
|
|
.p2align 2
|
|
_fe_ge_to_p3_avx2:
|
|
#endif /* __APPLE__ */
|
|
pushq %rbx
|
|
pushq %r12
|
|
pushq %r13
|
|
pushq %r14
|
|
pushq %r15
|
|
subq $40, %rsp
|
|
movq %rsi, (%rsp)
|
|
movq %rdx, 8(%rsp)
|
|
movq %rcx, 16(%rsp)
|
|
movq %r8, 24(%rsp)
|
|
movq %r9, 32(%rsp)
|
|
movq 24(%rsp), %rsi
|
|
movq 96(%rsp), %rbx
|
|
# Multiply
|
|
# A[0] * B[0]
|
|
movq (%rbx), %rdx
|
|
mulxq (%rsi), %r8, %r9
|
|
# A[2] * B[0]
|
|
mulxq 16(%rsi), %r10, %r11
|
|
# A[1] * B[0]
|
|
mulxq 8(%rsi), %rcx, %rax
|
|
xorq %r15, %r15
|
|
adcxq %rcx, %r9
|
|
# A[1] * B[3]
|
|
movq 24(%rbx), %rdx
|
|
mulxq 8(%rsi), %r12, %r13
|
|
adcxq %rax, %r10
|
|
# A[0] * B[1]
|
|
movq 8(%rbx), %rdx
|
|
mulxq (%rsi), %rcx, %rax
|
|
adoxq %rcx, %r9
|
|
# A[2] * B[1]
|
|
mulxq 16(%rsi), %rcx, %r14
|
|
adoxq %rax, %r10
|
|
adcxq %rcx, %r11
|
|
# A[1] * B[2]
|
|
movq 16(%rbx), %rdx
|
|
mulxq 8(%rsi), %rcx, %rax
|
|
adcxq %r14, %r12
|
|
adoxq %rcx, %r11
|
|
adcxq %r15, %r13
|
|
adoxq %rax, %r12
|
|
# A[0] * B[2]
|
|
mulxq (%rsi), %rcx, %rax
|
|
adoxq %r15, %r13
|
|
xorq %r14, %r14
|
|
adcxq %rcx, %r10
|
|
# A[1] * B[1]
|
|
movq 8(%rbx), %rdx
|
|
mulxq 8(%rsi), %rdx, %rcx
|
|
adcxq %rax, %r11
|
|
adoxq %rdx, %r10
|
|
# A[3] * B[1]
|
|
movq 8(%rbx), %rdx
|
|
adoxq %rcx, %r11
|
|
mulxq 24(%rsi), %rcx, %rax
|
|
adcxq %rcx, %r12
|
|
# A[2] * B[2]
|
|
movq 16(%rbx), %rdx
|
|
mulxq 16(%rsi), %rdx, %rcx
|
|
adcxq %rax, %r13
|
|
adoxq %rdx, %r12
|
|
# A[3] * B[3]
|
|
movq 24(%rbx), %rdx
|
|
adoxq %rcx, %r13
|
|
mulxq 24(%rsi), %rcx, %rax
|
|
adoxq %r15, %r14
|
|
adcxq %rcx, %r14
|
|
# A[0] * B[3]
|
|
mulxq (%rsi), %rdx, %rcx
|
|
adcxq %rax, %r15
|
|
xorq %rax, %rax
|
|
adcxq %rdx, %r11
|
|
# A[3] * B[0]
|
|
movq (%rbx), %rdx
|
|
adcxq %rcx, %r12
|
|
mulxq 24(%rsi), %rdx, %rcx
|
|
adoxq %rdx, %r11
|
|
adoxq %rcx, %r12
|
|
# A[2] * B[3]
|
|
movq 24(%rbx), %rdx
|
|
mulxq 16(%rsi), %rdx, %rcx
|
|
adcxq %rdx, %r13
|
|
# A[3] * B[2]
|
|
movq 16(%rbx), %rdx
|
|
adcxq %rcx, %r14
|
|
mulxq 24(%rsi), %rcx, %rdx
|
|
adcxq %rax, %r15
|
|
adoxq %rcx, %r13
|
|
adoxq %rdx, %r14
|
|
adoxq %rax, %r15
|
|
# Reduce
|
|
movq $0x7fffffffffffffff, %rax
|
|
# Move top half into t4-t7 and remove top bit from t3
|
|
shldq $0x01, %r14, %r15
|
|
shldq $0x01, %r13, %r14
|
|
shldq $0x01, %r12, %r13
|
|
shldq $0x01, %r11, %r12
|
|
andq %rax, %r11
|
|
# Multiply top half by 19
|
|
movq $19, %rdx
|
|
xorq %rax, %rax
|
|
mulxq %r12, %rcx, %r12
|
|
adcxq %rcx, %r8
|
|
adoxq %r12, %r9
|
|
mulxq %r13, %rcx, %r13
|
|
adcxq %rcx, %r9
|
|
adoxq %r13, %r10
|
|
mulxq %r14, %rcx, %r14
|
|
adcxq %rcx, %r10
|
|
adoxq %r14, %r11
|
|
mulxq %r15, %r15, %rdx
|
|
adcxq %r15, %r11
|
|
adoxq %rax, %rdx
|
|
adcxq %rax, %rdx
|
|
# Overflow
|
|
shldq $0x01, %r11, %rdx
|
|
movq $0x7fffffffffffffff, %rax
|
|
imulq $19, %rdx, %rcx
|
|
andq %rax, %r11
|
|
addq %rcx, %r8
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Reduce if top bit set
|
|
movq %r11, %rdx
|
|
shrq $63, %rdx
|
|
imulq $19, %rdx, %rcx
|
|
andq %rax, %r11
|
|
addq %rcx, %r8
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Store
|
|
movq %r8, (%rdi)
|
|
movq %r9, 8(%rdi)
|
|
movq %r10, 16(%rdi)
|
|
movq %r11, 24(%rdi)
|
|
movq (%rsp), %rdi
|
|
movq 32(%rsp), %rsi
|
|
movq 88(%rsp), %rbx
|
|
# Multiply
|
|
# A[0] * B[0]
|
|
movq (%rbx), %rdx
|
|
mulxq (%rsi), %r8, %r9
|
|
# A[2] * B[0]
|
|
mulxq 16(%rsi), %r10, %r11
|
|
# A[1] * B[0]
|
|
mulxq 8(%rsi), %rcx, %rax
|
|
xorq %r15, %r15
|
|
adcxq %rcx, %r9
|
|
# A[1] * B[3]
|
|
movq 24(%rbx), %rdx
|
|
mulxq 8(%rsi), %r12, %r13
|
|
adcxq %rax, %r10
|
|
# A[0] * B[1]
|
|
movq 8(%rbx), %rdx
|
|
mulxq (%rsi), %rcx, %rax
|
|
adoxq %rcx, %r9
|
|
# A[2] * B[1]
|
|
mulxq 16(%rsi), %rcx, %r14
|
|
adoxq %rax, %r10
|
|
adcxq %rcx, %r11
|
|
# A[1] * B[2]
|
|
movq 16(%rbx), %rdx
|
|
mulxq 8(%rsi), %rcx, %rax
|
|
adcxq %r14, %r12
|
|
adoxq %rcx, %r11
|
|
adcxq %r15, %r13
|
|
adoxq %rax, %r12
|
|
# A[0] * B[2]
|
|
mulxq (%rsi), %rcx, %rax
|
|
adoxq %r15, %r13
|
|
xorq %r14, %r14
|
|
adcxq %rcx, %r10
|
|
# A[1] * B[1]
|
|
movq 8(%rbx), %rdx
|
|
mulxq 8(%rsi), %rdx, %rcx
|
|
adcxq %rax, %r11
|
|
adoxq %rdx, %r10
|
|
# A[3] * B[1]
|
|
movq 8(%rbx), %rdx
|
|
adoxq %rcx, %r11
|
|
mulxq 24(%rsi), %rcx, %rax
|
|
adcxq %rcx, %r12
|
|
# A[2] * B[2]
|
|
movq 16(%rbx), %rdx
|
|
mulxq 16(%rsi), %rdx, %rcx
|
|
adcxq %rax, %r13
|
|
adoxq %rdx, %r12
|
|
# A[3] * B[3]
|
|
movq 24(%rbx), %rdx
|
|
adoxq %rcx, %r13
|
|
mulxq 24(%rsi), %rcx, %rax
|
|
adoxq %r15, %r14
|
|
adcxq %rcx, %r14
|
|
# A[0] * B[3]
|
|
mulxq (%rsi), %rdx, %rcx
|
|
adcxq %rax, %r15
|
|
xorq %rax, %rax
|
|
adcxq %rdx, %r11
|
|
# A[3] * B[0]
|
|
movq (%rbx), %rdx
|
|
adcxq %rcx, %r12
|
|
mulxq 24(%rsi), %rdx, %rcx
|
|
adoxq %rdx, %r11
|
|
adoxq %rcx, %r12
|
|
# A[2] * B[3]
|
|
movq 24(%rbx), %rdx
|
|
mulxq 16(%rsi), %rdx, %rcx
|
|
adcxq %rdx, %r13
|
|
# A[3] * B[2]
|
|
movq 16(%rbx), %rdx
|
|
adcxq %rcx, %r14
|
|
mulxq 24(%rsi), %rcx, %rdx
|
|
adcxq %rax, %r15
|
|
adoxq %rcx, %r13
|
|
adoxq %rdx, %r14
|
|
adoxq %rax, %r15
|
|
# Reduce
|
|
movq $0x7fffffffffffffff, %rax
|
|
# Move top half into t4-t7 and remove top bit from t3
|
|
shldq $0x01, %r14, %r15
|
|
shldq $0x01, %r13, %r14
|
|
shldq $0x01, %r12, %r13
|
|
shldq $0x01, %r11, %r12
|
|
andq %rax, %r11
|
|
# Multiply top half by 19
|
|
movq $19, %rdx
|
|
xorq %rax, %rax
|
|
mulxq %r12, %rcx, %r12
|
|
adcxq %rcx, %r8
|
|
adoxq %r12, %r9
|
|
mulxq %r13, %rcx, %r13
|
|
adcxq %rcx, %r9
|
|
adoxq %r13, %r10
|
|
mulxq %r14, %rcx, %r14
|
|
adcxq %rcx, %r10
|
|
adoxq %r14, %r11
|
|
mulxq %r15, %r15, %rdx
|
|
adcxq %r15, %r11
|
|
adoxq %rax, %rdx
|
|
adcxq %rax, %rdx
|
|
# Overflow
|
|
shldq $0x01, %r11, %rdx
|
|
movq $0x7fffffffffffffff, %rax
|
|
imulq $19, %rdx, %rcx
|
|
andq %rax, %r11
|
|
addq %rcx, %r8
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Reduce if top bit set
|
|
movq %r11, %rdx
|
|
shrq $63, %rdx
|
|
imulq $19, %rdx, %rcx
|
|
andq %rax, %r11
|
|
addq %rcx, %r8
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Store
|
|
movq %r8, (%rdi)
|
|
movq %r9, 8(%rdi)
|
|
movq %r10, 16(%rdi)
|
|
movq %r11, 24(%rdi)
|
|
movq 8(%rsp), %rdi
|
movq 96(%rsp), %rsi
# Multiply
# A[0] * B[0]
movq (%rsi), %rdx
mulxq (%rbx), %r8, %r9
# A[2] * B[0]
mulxq 16(%rbx), %r10, %r11
# A[1] * B[0]
mulxq 8(%rbx), %rcx, %rax
xorq %r15, %r15
adcxq %rcx, %r9
# A[1] * B[3]
movq 24(%rsi), %rdx
mulxq 8(%rbx), %r12, %r13
adcxq %rax, %r10
# A[0] * B[1]
movq 8(%rsi), %rdx
mulxq (%rbx), %rcx, %rax
adoxq %rcx, %r9
# A[2] * B[1]
mulxq 16(%rbx), %rcx, %r14
adoxq %rax, %r10
adcxq %rcx, %r11
# A[1] * B[2]
movq 16(%rsi), %rdx
mulxq 8(%rbx), %rcx, %rax
adcxq %r14, %r12
adoxq %rcx, %r11
adcxq %r15, %r13
adoxq %rax, %r12
# A[0] * B[2]
mulxq (%rbx), %rcx, %rax
adoxq %r15, %r13
xorq %r14, %r14
adcxq %rcx, %r10
# A[1] * B[1]
movq 8(%rsi), %rdx
mulxq 8(%rbx), %rdx, %rcx
adcxq %rax, %r11
adoxq %rdx, %r10
# A[3] * B[1]
movq 8(%rsi), %rdx
adoxq %rcx, %r11
mulxq 24(%rbx), %rcx, %rax
adcxq %rcx, %r12
# A[2] * B[2]
movq 16(%rsi), %rdx
mulxq 16(%rbx), %rdx, %rcx
adcxq %rax, %r13
adoxq %rdx, %r12
# A[3] * B[3]
movq 24(%rsi), %rdx
adoxq %rcx, %r13
mulxq 24(%rbx), %rcx, %rax
adoxq %r15, %r14
adcxq %rcx, %r14
# A[0] * B[3]
mulxq (%rbx), %rdx, %rcx
adcxq %rax, %r15
xorq %rax, %rax
adcxq %rdx, %r11
# A[3] * B[0]
movq (%rsi), %rdx
adcxq %rcx, %r12
mulxq 24(%rbx), %rdx, %rcx
adoxq %rdx, %r11
adoxq %rcx, %r12
# A[2] * B[3]
movq 24(%rsi), %rdx
mulxq 16(%rbx), %rdx, %rcx
adcxq %rdx, %r13
# A[3] * B[2]
movq 16(%rsi), %rdx
adcxq %rcx, %r14
mulxq 24(%rbx), %rcx, %rdx
adcxq %rax, %r15
adoxq %rcx, %r13
adoxq %rdx, %r14
adoxq %rax, %r15
# Reduce
movq $0x7fffffffffffffff, %rax
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rax, %r11
# Multiply top half by 19
movq $19, %rdx
xorq %rax, %rax
mulxq %r12, %rcx, %r12
adcxq %rcx, %r8
adoxq %r12, %r9
mulxq %r13, %rcx, %r13
adcxq %rcx, %r9
adoxq %r13, %r10
mulxq %r14, %rcx, %r14
adcxq %rcx, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rax, %rdx
adcxq %rax, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rax
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 16(%rsp), %rdi
movq 24(%rsp), %rsi
movq 32(%rsp), %rbx
# Multiply
# A[0] * B[0]
movq (%rbx), %rdx
mulxq (%rsi), %r8, %r9
# A[2] * B[0]
mulxq 16(%rsi), %r10, %r11
# A[1] * B[0]
mulxq 8(%rsi), %rcx, %rax
xorq %r15, %r15
adcxq %rcx, %r9
# A[1] * B[3]
movq 24(%rbx), %rdx
mulxq 8(%rsi), %r12, %r13
adcxq %rax, %r10
# A[0] * B[1]
movq 8(%rbx), %rdx
mulxq (%rsi), %rcx, %rax
adoxq %rcx, %r9
# A[2] * B[1]
mulxq 16(%rsi), %rcx, %r14
adoxq %rax, %r10
adcxq %rcx, %r11
# A[1] * B[2]
movq 16(%rbx), %rdx
mulxq 8(%rsi), %rcx, %rax
adcxq %r14, %r12
adoxq %rcx, %r11
adcxq %r15, %r13
adoxq %rax, %r12
# A[0] * B[2]
mulxq (%rsi), %rcx, %rax
adoxq %r15, %r13
xorq %r14, %r14
adcxq %rcx, %r10
# A[1] * B[1]
movq 8(%rbx), %rdx
mulxq 8(%rsi), %rdx, %rcx
adcxq %rax, %r11
adoxq %rdx, %r10
# A[3] * B[1]
movq 8(%rbx), %rdx
adoxq %rcx, %r11
mulxq 24(%rsi), %rcx, %rax
adcxq %rcx, %r12
# A[2] * B[2]
movq 16(%rbx), %rdx
mulxq 16(%rsi), %rdx, %rcx
adcxq %rax, %r13
adoxq %rdx, %r12
# A[3] * B[3]
movq 24(%rbx), %rdx
adoxq %rcx, %r13
mulxq 24(%rsi), %rcx, %rax
adoxq %r15, %r14
adcxq %rcx, %r14
# A[0] * B[3]
mulxq (%rsi), %rdx, %rcx
adcxq %rax, %r15
xorq %rax, %rax
adcxq %rdx, %r11
# A[3] * B[0]
movq (%rbx), %rdx
adcxq %rcx, %r12
mulxq 24(%rsi), %rdx, %rcx
adoxq %rdx, %r11
adoxq %rcx, %r12
# A[2] * B[3]
movq 24(%rbx), %rdx
mulxq 16(%rsi), %rdx, %rcx
adcxq %rdx, %r13
# A[3] * B[2]
movq 16(%rbx), %rdx
adcxq %rcx, %r14
mulxq 24(%rsi), %rcx, %rdx
adcxq %rax, %r15
adoxq %rcx, %r13
adoxq %rdx, %r14
adoxq %rax, %r15
# Reduce
movq $0x7fffffffffffffff, %rax
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rax, %r11
# Multiply top half by 19
movq $19, %rdx
xorq %rax, %rax
mulxq %r12, %rcx, %r12
adcxq %rcx, %r8
adoxq %r12, %r9
mulxq %r13, %rcx, %r13
adcxq %rcx, %r9
adoxq %r13, %r10
mulxq %r14, %rcx, %r14
adcxq %rcx, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rax, %rdx
adcxq %rax, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rax
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
addq $40, %rsp
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbx
repz retq
#ifndef __APPLE__
.size fe_ge_to_p3_avx2,.-fe_ge_to_p3_avx2
#endif /* __APPLE__ */
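# Note on the multiply blocks above: mulxq computes a 64x64->128-bit
# product without touching flags, so adcxq (carry flag) and adoxq
# (overflow flag) can accumulate partial products on two independent
# carry chains in parallel.  The reduction relies on 2^255 = 19 and
# 2^256 = 38 (mod 2^255 - 19): the top 256 bits are shifted down by one
# bit (shldq), multiplied by 19 and folded back into the low half.
# A rough C sketch of the same math (illustrative only, not part of
# this file; variable names are assumptions):
#
#   uint64_t t[8] = {0};
#   unsigned __int128 c;
#   for (int i = 0; i < 4; i++) {          /* schoolbook 4x4 multiply */
#       c = 0;
#       for (int j = 0; j < 4; j++) {
#           c += (unsigned __int128)a[i] * b[j] + t[i + j];
#           t[i + j] = (uint64_t)c;
#           c >>= 64;
#       }
#       t[i + 4] = (uint64_t)c;
#   }
#   /* then fold the top 255 bits of t[4..7] back in with weight 19, */
#   /* exactly what the shldq $0x01 / mulxq $19 block implements.    */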
#ifndef __APPLE__
.text
.globl fe_ge_dbl_avx2
.type fe_ge_dbl_avx2,@function
.align 4
fe_ge_dbl_avx2:
#else
.section __TEXT,__text
.globl _fe_ge_dbl_avx2
.p2align 2
_fe_ge_dbl_avx2:
#endif /* __APPLE__ */
pushq %rbp
pushq %rbx
pushq %r12
pushq %r13
pushq %r14
pushq %r15
subq $48, %rsp
movq %rdi, (%rsp)
movq %rsi, 8(%rsp)
movq %rdx, 16(%rsp)
movq %rcx, 24(%rsp)
movq %r8, 32(%rsp)
movq %r9, 40(%rsp)
movq 32(%rsp), %rsi
# Square
# A[0] * A[1]
movq (%rsi), %rdx
mulxq 8(%rsi), %r9, %r10
# A[0] * A[3]
mulxq 24(%rsi), %r11, %r12
# A[2] * A[1]
movq 16(%rsi), %rdx
mulxq 8(%rsi), %rcx, %rax
xorq %r15, %r15
adoxq %rcx, %r11
# A[2] * A[3]
mulxq 24(%rsi), %r13, %r14
adoxq %rax, %r12
# A[2] * A[0]
mulxq (%rsi), %rcx, %rax
adoxq %r15, %r13
adcxq %rcx, %r10
adoxq %r15, %r14
# A[1] * A[3]
movq 8(%rsi), %rdx
mulxq 24(%rsi), %rbp, %r8
adcxq %rax, %r11
adcxq %rbp, %r12
adcxq %r8, %r13
adcxq %r15, %r14
# Double with Carry Flag
xorq %r15, %r15
# A[0] * A[0]
movq (%rsi), %rdx
mulxq %rdx, %r8, %rbp
adcxq %r9, %r9
# A[1] * A[1]
movq 8(%rsi), %rdx
mulxq %rdx, %rcx, %rax
adcxq %r10, %r10
adoxq %rbp, %r9
adcxq %r11, %r11
adoxq %rcx, %r10
# A[2] * A[2]
movq 16(%rsi), %rdx
mulxq %rdx, %rbp, %rcx
adcxq %r12, %r12
adoxq %rax, %r11
adcxq %r13, %r13
adoxq %rbp, %r12
# A[3] * A[3]
movq 24(%rsi), %rdx
mulxq %rdx, %rbp, %rax
adcxq %r14, %r14
adoxq %rcx, %r13
adcxq %r15, %r15
adoxq %rbp, %r14
adoxq %rax, %r15
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rcx, %r11
# Multiply top half by 19
movq $19, %rdx
xorq %rcx, %rcx
mulxq %r12, %rbp, %r12
adcxq %rbp, %r8
adoxq %r12, %r9
mulxq %r13, %rbp, %r13
adcxq %rbp, %r9
adoxq %r13, %r10
mulxq %r14, %rbp, %r14
adcxq %rbp, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rcx, %rdx
adcxq %rcx, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rcx
imulq $19, %rdx, %rbp
andq %rcx, %r11
addq %rbp, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rbp
andq %rcx, %r11
addq %rbp, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
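# Squaring note: the block above computes only the cross products
# a[i]*a[j] for i < j, doubles them in-register with the carry chain
# (adcxq %r9, %r9 etc.), then mixes in the diagonal squares a[i]^2 on
# the overflow-flag chain.  That is the identity
#
#   a^2 = sum(a[i]^2 * 2^(128i)) + 2 * sum_{i<j}(a[i]*a[j] * 2^(64(i+j)))
#
# and it needs 10 mulxq instructions instead of the 16 a full multiply
# would take.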
movq 16(%rsp), %rdi
movq 40(%rsp), %rbx
# Square
# A[0] * A[1]
movq (%rbx), %rdx
mulxq 8(%rbx), %r9, %r10
# A[0] * A[3]
mulxq 24(%rbx), %r11, %r12
# A[2] * A[1]
movq 16(%rbx), %rdx
mulxq 8(%rbx), %rcx, %rax
xorq %r15, %r15
adoxq %rcx, %r11
# A[2] * A[3]
mulxq 24(%rbx), %r13, %r14
adoxq %rax, %r12
# A[2] * A[0]
mulxq (%rbx), %rcx, %rax
adoxq %r15, %r13
adcxq %rcx, %r10
adoxq %r15, %r14
# A[1] * A[3]
movq 8(%rbx), %rdx
mulxq 24(%rbx), %rbp, %r8
adcxq %rax, %r11
adcxq %rbp, %r12
adcxq %r8, %r13
adcxq %r15, %r14
# Double with Carry Flag
xorq %r15, %r15
# A[0] * A[0]
movq (%rbx), %rdx
mulxq %rdx, %r8, %rbp
adcxq %r9, %r9
# A[1] * A[1]
movq 8(%rbx), %rdx
mulxq %rdx, %rcx, %rax
adcxq %r10, %r10
adoxq %rbp, %r9
adcxq %r11, %r11
adoxq %rcx, %r10
# A[2] * A[2]
movq 16(%rbx), %rdx
mulxq %rdx, %rbp, %rcx
adcxq %r12, %r12
adoxq %rax, %r11
adcxq %r13, %r13
adoxq %rbp, %r12
# A[3] * A[3]
movq 24(%rbx), %rdx
mulxq %rdx, %rbp, %rax
adcxq %r14, %r14
adoxq %rcx, %r13
adcxq %r15, %r15
adoxq %rbp, %r14
adoxq %rax, %r15
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rcx, %r11
# Multiply top half by 19
movq $19, %rdx
xorq %rcx, %rcx
mulxq %r12, %rbp, %r12
adcxq %rbp, %r8
adoxq %r12, %r9
mulxq %r13, %rbp, %r13
adcxq %rbp, %r9
adoxq %r13, %r10
mulxq %r14, %rbp, %r14
adcxq %rbp, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rcx, %rdx
adcxq %rcx, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rcx
imulq $19, %rdx, %rbp
andq %rcx, %r11
addq %rbp, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rbp
andq %rcx, %r11
addq %rbp, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 8(%rsp), %rdi
# Add
movq (%rsi), %r8
movq 8(%rsi), %r9
addq (%rbx), %r8
movq 16(%rsi), %r10
adcq 8(%rbx), %r9
movq 24(%rsi), %rdx
adcq 16(%rbx), %r10
movq $-19, %rcx
adcq 24(%rbx), %rdx
movq $0x7fffffffffffffff, %rax
movq %rdx, %r11
sarq $63, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Sub modulus (if overflow)
subq %rcx, %r8
sbbq %rdx, %r9
sbbq %rdx, %r10
sbbq %rax, %r11
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
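# Branch-free conditional reduction: after the limb-wise addition,
# sarq $63 turns the top bit of the raw sum into an all-ones/all-zeros
# mask, which selects the limbs of p = 2^255-19
# (0xffffffffffffffed, ~0, ~0, 0x7fffffffffffffff) before the
# subq/sbbq chain subtracts them.  Illustrative C (assumed names, not
# part of this file):
#
#   uint64_t m = (uint64_t)((int64_t)r3 >> 63);  /* ~0 iff bit 255 set */
#   /* subtract (p & m) limb-wise with borrow, as subq/sbbq do above:  */
#   /*   r0 - (m & 0xffffffffffffffedULL), r1 - m, r2 - m,             */
#   /*   r3 - (m & 0x7fffffffffffffffULL)                              */
#
# No branch ever depends on the secret value, keeping the code
# constant-time.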
movq 24(%rsp), %rsi
# Square
# A[0] * A[1]
movq (%rdi), %rdx
mulxq 8(%rdi), %r9, %r10
# A[0] * A[3]
mulxq 24(%rdi), %r11, %r12
# A[2] * A[1]
movq 16(%rdi), %rdx
mulxq 8(%rdi), %rcx, %rax
xorq %r15, %r15
adoxq %rcx, %r11
# A[2] * A[3]
mulxq 24(%rdi), %r13, %r14
adoxq %rax, %r12
# A[2] * A[0]
mulxq (%rdi), %rcx, %rax
adoxq %r15, %r13
adcxq %rcx, %r10
adoxq %r15, %r14
# A[1] * A[3]
movq 8(%rdi), %rdx
mulxq 24(%rdi), %rbp, %r8
adcxq %rax, %r11
adcxq %rbp, %r12
adcxq %r8, %r13
adcxq %r15, %r14
# Double with Carry Flag
xorq %r15, %r15
# A[0] * A[0]
movq (%rdi), %rdx
mulxq %rdx, %r8, %rbp
adcxq %r9, %r9
# A[1] * A[1]
movq 8(%rdi), %rdx
mulxq %rdx, %rcx, %rax
adcxq %r10, %r10
adoxq %rbp, %r9
adcxq %r11, %r11
adoxq %rcx, %r10
# A[2] * A[2]
movq 16(%rdi), %rdx
mulxq %rdx, %rbp, %rcx
adcxq %r12, %r12
adoxq %rax, %r11
adcxq %r13, %r13
adoxq %rbp, %r12
# A[3] * A[3]
movq 24(%rdi), %rdx
mulxq %rdx, %rbp, %rax
adcxq %r14, %r14
adoxq %rcx, %r13
adcxq %r15, %r15
adoxq %rbp, %r14
adoxq %rax, %r15
# Reduce
movq $0x7fffffffffffffff, %rcx
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rcx, %r11
# Multiply top half by 19
movq $19, %rdx
xorq %rcx, %rcx
mulxq %r12, %rbp, %r12
adcxq %rbp, %r8
adoxq %r12, %r9
mulxq %r13, %rbp, %r13
adcxq %rbp, %r9
adoxq %r13, %r10
mulxq %r14, %rbp, %r14
adcxq %rbp, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rcx, %rdx
adcxq %rcx, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rcx
imulq $19, %rdx, %rbp
andq %rcx, %r11
addq %rbp, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rbp
andq %rcx, %r11
addq %rbp, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rsi)
movq %r9, 8(%rsi)
movq %r10, 16(%rsi)
movq %r11, 24(%rsi)
movq 16(%rsp), %rsi
movq (%rsp), %rbx
# Add
movq (%rsi), %r8
movq 8(%rsi), %r9
movq 16(%rsi), %r10
movq 24(%rsi), %rdx
movq %r8, %r12
addq (%rbx), %r8
movq %r9, %r13
adcq 8(%rbx), %r9
movq %r10, %r14
adcq 16(%rbx), %r10
movq %rdx, %r15
adcq 24(%rbx), %rdx
movq $-19, %rcx
movq %rdx, %r11
movq $0x7fffffffffffffff, %rax
sarq $63, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Sub modulus (if overflow)
subq %rcx, %r8
sbbq %rdx, %r9
sbbq %rdx, %r10
sbbq %rax, %r11
# Sub
subq (%rbx), %r12
movq $0x00, %rdx
sbbq 8(%rbx), %r13
movq $-19, %rcx
sbbq 16(%rbx), %r14
movq $0x7fffffffffffffff, %rax
sbbq 24(%rbx), %r15
sbbq $0x00, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Add modulus (if underflow)
addq %rcx, %r12
adcq %rdx, %r13
adcq %rdx, %r14
adcq %rax, %r15
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq %r12, (%rsi)
movq %r13, 8(%rsi)
movq %r14, 16(%rsi)
movq %r15, 24(%rsi)
movq 24(%rsp), %rsi
# Sub
movq (%rsi), %r8
movq 8(%rsi), %r9
movq 16(%rsi), %r10
movq 24(%rsi), %r11
subq (%rdi), %r8
movq $0x00, %rdx
sbbq 8(%rdi), %r9
movq $-19, %rcx
sbbq 16(%rdi), %r10
movq $0x7fffffffffffffff, %rax
sbbq 24(%rdi), %r11
sbbq $0x00, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Add modulus (if underflow)
addq %rcx, %r8
adcq %rdx, %r9
adcq %rdx, %r10
adcq %rax, %r11
movq %r8, (%rbx)
movq %r9, 8(%rbx)
movq %r10, 16(%rbx)
movq %r11, 24(%rbx)
movq 104(%rsp), %rdi
# Square * 2
# A[0] * A[1]
movq (%rdi), %rdx
mulxq 8(%rdi), %r9, %r10
# A[0] * A[3]
mulxq 24(%rdi), %r11, %r12
# A[2] * A[1]
movq 16(%rdi), %rdx
mulxq 8(%rdi), %rcx, %rax
xorq %r15, %r15
adoxq %rcx, %r11
# A[2] * A[3]
mulxq 24(%rdi), %r13, %r14
adoxq %rax, %r12
# A[2] * A[0]
mulxq (%rdi), %rcx, %rax
adoxq %r15, %r13
adcxq %rcx, %r10
adoxq %r15, %r14
# A[1] * A[3]
movq 8(%rdi), %rdx
mulxq 24(%rdi), %rbp, %r8
adcxq %rax, %r11
adcxq %rbp, %r12
adcxq %r8, %r13
adcxq %r15, %r14
# Double with Carry Flag
xorq %r15, %r15
# A[0] * A[0]
movq (%rdi), %rdx
mulxq %rdx, %r8, %rbp
adcxq %r9, %r9
# A[1] * A[1]
movq 8(%rdi), %rdx
mulxq %rdx, %rcx, %rax
adcxq %r10, %r10
adoxq %rbp, %r9
adcxq %r11, %r11
adoxq %rcx, %r10
# A[2] * A[2]
movq 16(%rdi), %rdx
mulxq %rdx, %rbp, %rcx
adcxq %r12, %r12
adoxq %rax, %r11
adcxq %r13, %r13
adoxq %rbp, %r12
# A[3] * A[3]
movq 24(%rdi), %rdx
mulxq %rdx, %rbp, %rax
adcxq %r14, %r14
adoxq %rcx, %r13
adcxq %r15, %r15
adoxq %rbp, %r14
adoxq %rax, %r15
# Reduce
movq $0x7fffffffffffffff, %rax
xorq %rbp, %rbp
# Move top half into t4-t7 and remove top bit from t3 and double
shldq $3, %r15, %rbp
shldq $2, %r14, %r15
shldq $2, %r13, %r14
shldq $2, %r12, %r13
shldq $2, %r11, %r12
shldq $0x01, %r10, %r11
shldq $0x01, %r9, %r10
shldq $0x01, %r8, %r9
shlq $0x01, %r8
andq %rax, %r11
# Two out left, one in right
andq %rax, %r15
# Multiply top bits by 19*19
imulq $0x169, %rbp, %rcx
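# 0x169 = 361 = 19*19: after doubling, the bits collected in %rbp sit
# at weight 2^510 and above, and since 2^255 = 19 (mod 2^255 - 19),
# 2^510 = 19^2, so these bits fold back in with a factor of 361.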
xorq %rax, %rax
# Multiply top half by 19
movq $19, %rdx
adoxq %rcx, %r8
mulxq %r12, %rbp, %r12
adcxq %rbp, %r8
adoxq %r12, %r9
mulxq %r13, %rbp, %r13
adcxq %rbp, %r9
adoxq %r13, %r10
mulxq %r14, %rbp, %r14
adcxq %rbp, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rax, %rdx
adcxq %rax, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rax
imulq $19, %rdx, %rbp
andq %rax, %r11
addq %rbp, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rbp
andq %rax, %r11
addq %rbp, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rsi)
movq %r9, 8(%rsi)
movq %r10, 16(%rsi)
movq %r11, 24(%rsi)
movq 16(%rsp), %rdi
# Sub
movq (%rsi), %r8
movq 8(%rsi), %r9
movq 16(%rsi), %r10
movq 24(%rsi), %r11
subq (%rdi), %r8
movq $0x00, %rdx
sbbq 8(%rdi), %r9
movq $-19, %rcx
sbbq 16(%rdi), %r10
movq $0x7fffffffffffffff, %rax
sbbq 24(%rdi), %r11
sbbq $0x00, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Add modulus (if underflow)
addq %rcx, %r8
adcq %rdx, %r9
adcq %rdx, %r10
adcq %rax, %r11
movq %r8, (%rsi)
movq %r9, 8(%rsi)
movq %r10, 16(%rsi)
movq %r11, 24(%rsi)
addq $48, %rsp
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbx
popq %rbp
repz retq
#ifndef __APPLE__
.size fe_ge_dbl_avx2,.-fe_ge_dbl_avx2
#endif /* __APPLE__ */
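# The "Square * 2" block inside fe_ge_dbl_avx2 is fe_sq2 inlined:
# it computes 2*a^2 mod 2^255-19, with the doubling folded into the
# reduction shifts (shldq $2 on the top half, shldq/shlq $0x01 on the
# bottom) instead of a separate doubling pass.  Sketch of the intent
# (assumed helper name, not part of this file):
#
#   /* fe_sq2: r = 2 * a * a mod 2^255-19 */
#   fe_sq(t, a);        /* 512-bit square, then reduce */
#   fe_add(r, t, t);    /* what the fused shifts above avoid */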
#ifndef __APPLE__
.text
.globl fe_ge_madd_avx2
.type fe_ge_madd_avx2,@function
.align 4
fe_ge_madd_avx2:
#else
.section __TEXT,__text
.globl _fe_ge_madd_avx2
.p2align 2
_fe_ge_madd_avx2:
#endif /* __APPLE__ */
pushq %rbp
pushq %rbx
pushq %r12
pushq %r13
pushq %r14
pushq %r15
subq $48, %rsp
movq %rdi, (%rsp)
movq %rsi, 8(%rsp)
movq %rdx, 16(%rsp)
movq %rcx, 24(%rsp)
movq %r8, 32(%rsp)
movq %r9, 40(%rsp)
movq 8(%rsp), %rsi
movq 40(%rsp), %rbx
movq 32(%rsp), %rbp
# Add
movq (%rbx), %r8
movq 8(%rbx), %r9
movq 16(%rbx), %r10
movq 24(%rbx), %rdx
movq %r8, %r12
addq (%rbp), %r8
movq %r9, %r13
adcq 8(%rbp), %r9
movq %r10, %r14
adcq 16(%rbp), %r10
movq %rdx, %r15
adcq 24(%rbp), %rdx
movq $-19, %rcx
movq %rdx, %r11
movq $0x7fffffffffffffff, %rax
sarq $63, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Sub modulus (if overflow)
subq %rcx, %r8
sbbq %rdx, %r9
sbbq %rdx, %r10
sbbq %rax, %r11
# Sub
subq (%rbp), %r12
movq $0x00, %rdx
sbbq 8(%rbp), %r13
movq $-19, %rcx
sbbq 16(%rbp), %r14
movq $0x7fffffffffffffff, %rax
sbbq 24(%rbp), %r15
sbbq $0x00, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Add modulus (if underflow)
addq %rcx, %r12
adcq %rdx, %r13
adcq %rdx, %r14
adcq %rax, %r15
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq %r12, (%rsi)
movq %r13, 8(%rsi)
movq %r14, 16(%rsi)
movq %r15, 24(%rsi)
movq 16(%rsp), %rbx
movq 128(%rsp), %rbp
# Multiply
# A[0] * B[0]
movq (%rbp), %rdx
mulxq (%rdi), %r8, %r9
# A[2] * B[0]
mulxq 16(%rdi), %r10, %r11
# A[1] * B[0]
mulxq 8(%rdi), %rcx, %rax
xorq %r15, %r15
adcxq %rcx, %r9
# A[1] * B[3]
movq 24(%rbp), %rdx
mulxq 8(%rdi), %r12, %r13
adcxq %rax, %r10
# A[0] * B[1]
movq 8(%rbp), %rdx
mulxq (%rdi), %rcx, %rax
adoxq %rcx, %r9
# A[2] * B[1]
mulxq 16(%rdi), %rcx, %r14
adoxq %rax, %r10
adcxq %rcx, %r11
# A[1] * B[2]
movq 16(%rbp), %rdx
mulxq 8(%rdi), %rcx, %rax
adcxq %r14, %r12
adoxq %rcx, %r11
adcxq %r15, %r13
adoxq %rax, %r12
# A[0] * B[2]
mulxq (%rdi), %rcx, %rax
adoxq %r15, %r13
xorq %r14, %r14
adcxq %rcx, %r10
# A[1] * B[1]
movq 8(%rbp), %rdx
mulxq 8(%rdi), %rdx, %rcx
adcxq %rax, %r11
adoxq %rdx, %r10
# A[3] * B[1]
movq 8(%rbp), %rdx
adoxq %rcx, %r11
mulxq 24(%rdi), %rcx, %rax
adcxq %rcx, %r12
# A[2] * B[2]
movq 16(%rbp), %rdx
mulxq 16(%rdi), %rdx, %rcx
adcxq %rax, %r13
adoxq %rdx, %r12
# A[3] * B[3]
movq 24(%rbp), %rdx
adoxq %rcx, %r13
mulxq 24(%rdi), %rcx, %rax
adoxq %r15, %r14
adcxq %rcx, %r14
# A[0] * B[3]
mulxq (%rdi), %rdx, %rcx
adcxq %rax, %r15
xorq %rax, %rax
adcxq %rdx, %r11
# A[3] * B[0]
movq (%rbp), %rdx
adcxq %rcx, %r12
mulxq 24(%rdi), %rdx, %rcx
adoxq %rdx, %r11
adoxq %rcx, %r12
# A[2] * B[3]
movq 24(%rbp), %rdx
mulxq 16(%rdi), %rdx, %rcx
adcxq %rdx, %r13
# A[3] * B[2]
movq 16(%rbp), %rdx
adcxq %rcx, %r14
mulxq 24(%rdi), %rcx, %rdx
adcxq %rax, %r15
adoxq %rcx, %r13
adoxq %rdx, %r14
adoxq %rax, %r15
# Reduce
movq $0x7fffffffffffffff, %rax
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rax, %r11
# Multiply top half by 19
movq $19, %rdx
xorq %rax, %rax
mulxq %r12, %rcx, %r12
adcxq %rcx, %r8
adoxq %r12, %r9
mulxq %r13, %rcx, %r13
adcxq %rcx, %r9
adoxq %r13, %r10
mulxq %r14, %rcx, %r14
adcxq %rcx, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rax, %rdx
adcxq %rax, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rax
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rbx)
movq %r9, 8(%rbx)
movq %r10, 16(%rbx)
movq %r11, 24(%rbx)
movq 136(%rsp), %rdi
# Multiply
# A[0] * B[0]
movq (%rdi), %rdx
mulxq (%rsi), %r8, %r9
# A[2] * B[0]
mulxq 16(%rsi), %r10, %r11
# A[1] * B[0]
mulxq 8(%rsi), %rcx, %rax
xorq %r15, %r15
adcxq %rcx, %r9
# A[1] * B[3]
movq 24(%rdi), %rdx
mulxq 8(%rsi), %r12, %r13
adcxq %rax, %r10
# A[0] * B[1]
movq 8(%rdi), %rdx
mulxq (%rsi), %rcx, %rax
adoxq %rcx, %r9
# A[2] * B[1]
mulxq 16(%rsi), %rcx, %r14
adoxq %rax, %r10
adcxq %rcx, %r11
# A[1] * B[2]
movq 16(%rdi), %rdx
mulxq 8(%rsi), %rcx, %rax
adcxq %r14, %r12
adoxq %rcx, %r11
adcxq %r15, %r13
adoxq %rax, %r12
# A[0] * B[2]
mulxq (%rsi), %rcx, %rax
adoxq %r15, %r13
xorq %r14, %r14
adcxq %rcx, %r10
# A[1] * B[1]
movq 8(%rdi), %rdx
mulxq 8(%rsi), %rdx, %rcx
adcxq %rax, %r11
adoxq %rdx, %r10
# A[3] * B[1]
movq 8(%rdi), %rdx
adoxq %rcx, %r11
mulxq 24(%rsi), %rcx, %rax
adcxq %rcx, %r12
# A[2] * B[2]
movq 16(%rdi), %rdx
mulxq 16(%rsi), %rdx, %rcx
adcxq %rax, %r13
adoxq %rdx, %r12
# A[3] * B[3]
movq 24(%rdi), %rdx
adoxq %rcx, %r13
mulxq 24(%rsi), %rcx, %rax
adoxq %r15, %r14
adcxq %rcx, %r14
# A[0] * B[3]
mulxq (%rsi), %rdx, %rcx
adcxq %rax, %r15
xorq %rax, %rax
adcxq %rdx, %r11
# A[3] * B[0]
movq (%rdi), %rdx
adcxq %rcx, %r12
mulxq 24(%rsi), %rdx, %rcx
adoxq %rdx, %r11
adoxq %rcx, %r12
# A[2] * B[3]
movq 24(%rdi), %rdx
mulxq 16(%rsi), %rdx, %rcx
adcxq %rdx, %r13
# A[3] * B[2]
movq 16(%rdi), %rdx
adcxq %rcx, %r14
mulxq 24(%rsi), %rcx, %rdx
adcxq %rax, %r15
adoxq %rcx, %r13
adoxq %rdx, %r14
adoxq %rax, %r15
# Reduce
movq $0x7fffffffffffffff, %rax
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rax, %r11
# Multiply top half by 19
movq $19, %rdx
xorq %rax, %rax
mulxq %r12, %rcx, %r12
adcxq %rcx, %r8
adoxq %r12, %r9
mulxq %r13, %rcx, %r13
adcxq %rcx, %r9
adoxq %r13, %r10
mulxq %r14, %rcx, %r14
adcxq %rcx, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rax, %rdx
adcxq %rax, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rax
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rsi)
movq %r9, 8(%rsi)
movq %r10, 16(%rsi)
movq %r11, 24(%rsi)
movq 24(%rsp), %rdi
movq 120(%rsp), %rsi
movq 112(%rsp), %rbp
# Multiply
# A[0] * B[0]
movq (%rbp), %rdx
mulxq (%rsi), %r8, %r9
# A[2] * B[0]
mulxq 16(%rsi), %r10, %r11
# A[1] * B[0]
mulxq 8(%rsi), %rcx, %rax
xorq %r15, %r15
adcxq %rcx, %r9
# A[1] * B[3]
movq 24(%rbp), %rdx
mulxq 8(%rsi), %r12, %r13
adcxq %rax, %r10
# A[0] * B[1]
movq 8(%rbp), %rdx
mulxq (%rsi), %rcx, %rax
adoxq %rcx, %r9
# A[2] * B[1]
mulxq 16(%rsi), %rcx, %r14
adoxq %rax, %r10
adcxq %rcx, %r11
# A[1] * B[2]
movq 16(%rbp), %rdx
mulxq 8(%rsi), %rcx, %rax
adcxq %r14, %r12
adoxq %rcx, %r11
adcxq %r15, %r13
adoxq %rax, %r12
# A[0] * B[2]
mulxq (%rsi), %rcx, %rax
adoxq %r15, %r13
xorq %r14, %r14
adcxq %rcx, %r10
# A[1] * B[1]
movq 8(%rbp), %rdx
mulxq 8(%rsi), %rdx, %rcx
adcxq %rax, %r11
adoxq %rdx, %r10
# A[3] * B[1]
movq 8(%rbp), %rdx
adoxq %rcx, %r11
mulxq 24(%rsi), %rcx, %rax
adcxq %rcx, %r12
# A[2] * B[2]
movq 16(%rbp), %rdx
mulxq 16(%rsi), %rdx, %rcx
adcxq %rax, %r13
adoxq %rdx, %r12
# A[3] * B[3]
movq 24(%rbp), %rdx
adoxq %rcx, %r13
mulxq 24(%rsi), %rcx, %rax
adoxq %r15, %r14
adcxq %rcx, %r14
# A[0] * B[3]
mulxq (%rsi), %rdx, %rcx
adcxq %rax, %r15
xorq %rax, %rax
adcxq %rdx, %r11
# A[3] * B[0]
movq (%rbp), %rdx
adcxq %rcx, %r12
mulxq 24(%rsi), %rdx, %rcx
adoxq %rdx, %r11
adoxq %rcx, %r12
# A[2] * B[3]
movq 24(%rbp), %rdx
mulxq 16(%rsi), %rdx, %rcx
adcxq %rdx, %r13
# A[3] * B[2]
movq 16(%rbp), %rdx
adcxq %rcx, %r14
mulxq 24(%rsi), %rcx, %rdx
adcxq %rax, %r15
adoxq %rcx, %r13
adoxq %rdx, %r14
adoxq %rax, %r15
# Reduce
movq $0x7fffffffffffffff, %rax
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rax, %r11
# Multiply top half by 19
movq $19, %rdx
xorq %rax, %rax
mulxq %r12, %rcx, %r12
adcxq %rcx, %r8
adoxq %r12, %r9
mulxq %r13, %rcx, %r13
adcxq %rcx, %r9
adoxq %r13, %r10
mulxq %r14, %rcx, %r14
adcxq %rcx, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rax, %rdx
adcxq %rax, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rax
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 8(%rsp), %rdi
movq (%rsp), %rsi
# Add
movq (%rbx), %r8
movq 8(%rbx), %r9
movq 16(%rbx), %r10
movq 24(%rbx), %rdx
movq %r8, %r12
addq (%rdi), %r8
movq %r9, %r13
adcq 8(%rdi), %r9
movq %r10, %r14
adcq 16(%rdi), %r10
movq %rdx, %r15
adcq 24(%rdi), %rdx
movq $-19, %rcx
movq %rdx, %r11
movq $0x7fffffffffffffff, %rax
sarq $63, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Sub modulus (if overflow)
subq %rcx, %r8
sbbq %rdx, %r9
sbbq %rdx, %r10
sbbq %rax, %r11
# Sub
subq (%rdi), %r12
movq $0x00, %rdx
sbbq 8(%rdi), %r13
movq $-19, %rcx
sbbq 16(%rdi), %r14
movq $0x7fffffffffffffff, %rax
sbbq 24(%rdi), %r15
sbbq $0x00, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Add modulus (if underflow)
addq %rcx, %r12
adcq %rdx, %r13
adcq %rdx, %r14
adcq %rax, %r15
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq %r12, (%rsi)
movq %r13, 8(%rsi)
movq %r14, 16(%rsi)
movq %r15, 24(%rsi)
movq 104(%rsp), %rdi
# Double
movq (%rdi), %r8
movq 8(%rdi), %r9
addq %r8, %r8
movq 16(%rdi), %r10
adcq %r9, %r9
movq 24(%rdi), %rdx
adcq %r10, %r10
movq $-19, %rcx
adcq %rdx, %rdx
movq $0x7fffffffffffffff, %rax
movq %rdx, %r11
sarq $63, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Sub modulus (if overflow)
subq %rcx, %r8
sbbq %rdx, %r9
sbbq %rdx, %r10
sbbq %rax, %r11
movq %r8, (%rbx)
movq %r9, 8(%rbx)
movq %r10, 16(%rbx)
movq %r11, 24(%rbx)
movq 24(%rsp), %rdi
# Add
movq (%rbx), %r8
movq 8(%rbx), %r9
movq 16(%rbx), %r10
movq 24(%rbx), %rdx
movq %r8, %r12
addq (%rdi), %r8
movq %r9, %r13
adcq 8(%rdi), %r9
movq %r10, %r14
adcq 16(%rdi), %r10
movq %rdx, %r15
adcq 24(%rdi), %rdx
movq $-19, %rcx
movq %rdx, %r11
movq $0x7fffffffffffffff, %rax
sarq $63, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Sub modulus (if overflow)
subq %rcx, %r8
sbbq %rdx, %r9
sbbq %rdx, %r10
sbbq %rax, %r11
# Sub
subq (%rdi), %r12
movq $0x00, %rdx
sbbq 8(%rdi), %r13
movq $-19, %rcx
sbbq 16(%rdi), %r14
movq $0x7fffffffffffffff, %rax
sbbq 24(%rdi), %r15
sbbq $0x00, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Add modulus (if underflow)
addq %rcx, %r12
adcq %rdx, %r13
adcq %rdx, %r14
adcq %rax, %r15
movq %r8, (%rbx)
movq %r9, 8(%rbx)
movq %r10, 16(%rbx)
movq %r11, 24(%rbx)
movq %r12, (%rdi)
movq %r13, 8(%rdi)
movq %r14, 16(%rdi)
movq %r15, 24(%rdi)
addq $48, %rsp
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbx
popq %rbp
repz retq
#ifndef __APPLE__
.size fe_ge_madd_avx2,.-fe_ge_madd_avx2
#endif /* __APPLE__ */
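# fe_ge_madd_avx2 adds a precomputed point stored in (y+x, y-x, xy2d)
# form.  Assuming the stack operands at 104..136(%rsp) are Z1, T1 and
# the precomputed y+x, y-x, xy2d (inferred from the operand order, not
# spelled out in this file), the flow matches the classic ref10
# ge_madd shape:
#
#   A = (Y1+X1) * yplusx     B = (Y1-X1) * yminusx
#   C = xy2d * T1            D = 2*Z1
#   X3 = A-B,  Y3 = A+B,  Z3 = D+C,  T3 = D-C
#
# i.e. the Add/Sub pair up front, three multiplies, a Double, and the
# final Add/Sub pair.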
#ifndef __APPLE__
.text
.globl fe_ge_msub_avx2
.type fe_ge_msub_avx2,@function
.align 4
fe_ge_msub_avx2:
#else
.section __TEXT,__text
.globl _fe_ge_msub_avx2
.p2align 2
_fe_ge_msub_avx2:
#endif /* __APPLE__ */
pushq %rbp
pushq %rbx
pushq %r12
pushq %r13
pushq %r14
pushq %r15
subq $48, %rsp
movq %rdi, (%rsp)
movq %rsi, 8(%rsp)
movq %rdx, 16(%rsp)
movq %rcx, 24(%rsp)
movq %r8, 32(%rsp)
movq %r9, 40(%rsp)
movq 8(%rsp), %rsi
movq 40(%rsp), %rbx
movq 32(%rsp), %rbp
# Add
movq (%rbx), %r8
movq 8(%rbx), %r9
movq 16(%rbx), %r10
movq 24(%rbx), %rdx
movq %r8, %r12
addq (%rbp), %r8
movq %r9, %r13
adcq 8(%rbp), %r9
movq %r10, %r14
adcq 16(%rbp), %r10
movq %rdx, %r15
adcq 24(%rbp), %rdx
movq $-19, %rcx
movq %rdx, %r11
movq $0x7fffffffffffffff, %rax
sarq $63, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Sub modulus (if overflow)
subq %rcx, %r8
sbbq %rdx, %r9
sbbq %rdx, %r10
sbbq %rax, %r11
# Sub
subq (%rbp), %r12
movq $0x00, %rdx
sbbq 8(%rbp), %r13
movq $-19, %rcx
sbbq 16(%rbp), %r14
movq $0x7fffffffffffffff, %rax
sbbq 24(%rbp), %r15
sbbq $0x00, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Add modulus (if underflow)
addq %rcx, %r12
adcq %rdx, %r13
adcq %rdx, %r14
adcq %rax, %r15
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq %r12, (%rsi)
movq %r13, 8(%rsi)
movq %r14, 16(%rsi)
movq %r15, 24(%rsi)
movq 16(%rsp), %rbx
movq 136(%rsp), %rbp
# Multiply
# A[0] * B[0]
movq (%rbp), %rdx
mulxq (%rdi), %r8, %r9
# A[2] * B[0]
mulxq 16(%rdi), %r10, %r11
# A[1] * B[0]
mulxq 8(%rdi), %rcx, %rax
xorq %r15, %r15
adcxq %rcx, %r9
# A[1] * B[3]
movq 24(%rbp), %rdx
mulxq 8(%rdi), %r12, %r13
adcxq %rax, %r10
# A[0] * B[1]
movq 8(%rbp), %rdx
mulxq (%rdi), %rcx, %rax
adoxq %rcx, %r9
# A[2] * B[1]
mulxq 16(%rdi), %rcx, %r14
adoxq %rax, %r10
adcxq %rcx, %r11
# A[1] * B[2]
movq 16(%rbp), %rdx
mulxq 8(%rdi), %rcx, %rax
adcxq %r14, %r12
adoxq %rcx, %r11
adcxq %r15, %r13
adoxq %rax, %r12
# A[0] * B[2]
mulxq (%rdi), %rcx, %rax
adoxq %r15, %r13
xorq %r14, %r14
adcxq %rcx, %r10
# A[1] * B[1]
movq 8(%rbp), %rdx
mulxq 8(%rdi), %rdx, %rcx
adcxq %rax, %r11
adoxq %rdx, %r10
# A[3] * B[1]
movq 8(%rbp), %rdx
adoxq %rcx, %r11
mulxq 24(%rdi), %rcx, %rax
adcxq %rcx, %r12
# A[2] * B[2]
movq 16(%rbp), %rdx
mulxq 16(%rdi), %rdx, %rcx
adcxq %rax, %r13
adoxq %rdx, %r12
# A[3] * B[3]
movq 24(%rbp), %rdx
adoxq %rcx, %r13
mulxq 24(%rdi), %rcx, %rax
adoxq %r15, %r14
adcxq %rcx, %r14
# A[0] * B[3]
mulxq (%rdi), %rdx, %rcx
adcxq %rax, %r15
xorq %rax, %rax
adcxq %rdx, %r11
# A[3] * B[0]
movq (%rbp), %rdx
adcxq %rcx, %r12
mulxq 24(%rdi), %rdx, %rcx
adoxq %rdx, %r11
adoxq %rcx, %r12
# A[2] * B[3]
movq 24(%rbp), %rdx
mulxq 16(%rdi), %rdx, %rcx
adcxq %rdx, %r13
# A[3] * B[2]
movq 16(%rbp), %rdx
adcxq %rcx, %r14
mulxq 24(%rdi), %rcx, %rdx
adcxq %rax, %r15
adoxq %rcx, %r13
adoxq %rdx, %r14
adoxq %rax, %r15
# Reduce
movq $0x7fffffffffffffff, %rax
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rax, %r11
# Multiply top half by 19
movq $19, %rdx
xorq %rax, %rax
mulxq %r12, %rcx, %r12
adcxq %rcx, %r8
adoxq %r12, %r9
mulxq %r13, %rcx, %r13
adcxq %rcx, %r9
adoxq %r13, %r10
mulxq %r14, %rcx, %r14
adcxq %rcx, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rax, %rdx
adcxq %rax, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rax
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rbx)
movq %r9, 8(%rbx)
movq %r10, 16(%rbx)
movq %r11, 24(%rbx)
movq 128(%rsp), %rdi
# Multiply
# A[0] * B[0]
movq (%rdi), %rdx
mulxq (%rsi), %r8, %r9
# A[2] * B[0]
mulxq 16(%rsi), %r10, %r11
# A[1] * B[0]
mulxq 8(%rsi), %rcx, %rax
xorq %r15, %r15
adcxq %rcx, %r9
# A[1] * B[3]
movq 24(%rdi), %rdx
mulxq 8(%rsi), %r12, %r13
adcxq %rax, %r10
# A[0] * B[1]
movq 8(%rdi), %rdx
mulxq (%rsi), %rcx, %rax
adoxq %rcx, %r9
# A[2] * B[1]
mulxq 16(%rsi), %rcx, %r14
adoxq %rax, %r10
adcxq %rcx, %r11
# A[1] * B[2]
movq 16(%rdi), %rdx
mulxq 8(%rsi), %rcx, %rax
adcxq %r14, %r12
adoxq %rcx, %r11
adcxq %r15, %r13
adoxq %rax, %r12
# A[0] * B[2]
mulxq (%rsi), %rcx, %rax
adoxq %r15, %r13
xorq %r14, %r14
adcxq %rcx, %r10
# A[1] * B[1]
movq 8(%rdi), %rdx
mulxq 8(%rsi), %rdx, %rcx
adcxq %rax, %r11
adoxq %rdx, %r10
# A[3] * B[1]
movq 8(%rdi), %rdx
adoxq %rcx, %r11
mulxq 24(%rsi), %rcx, %rax
adcxq %rcx, %r12
# A[2] * B[2]
movq 16(%rdi), %rdx
mulxq 16(%rsi), %rdx, %rcx
adcxq %rax, %r13
adoxq %rdx, %r12
# A[3] * B[3]
movq 24(%rdi), %rdx
adoxq %rcx, %r13
mulxq 24(%rsi), %rcx, %rax
adoxq %r15, %r14
adcxq %rcx, %r14
# A[0] * B[3]
mulxq (%rsi), %rdx, %rcx
adcxq %rax, %r15
xorq %rax, %rax
adcxq %rdx, %r11
# A[3] * B[0]
movq (%rdi), %rdx
adcxq %rcx, %r12
mulxq 24(%rsi), %rdx, %rcx
adoxq %rdx, %r11
adoxq %rcx, %r12
# A[2] * B[3]
movq 24(%rdi), %rdx
mulxq 16(%rsi), %rdx, %rcx
adcxq %rdx, %r13
# A[3] * B[2]
movq 16(%rdi), %rdx
adcxq %rcx, %r14
mulxq 24(%rsi), %rcx, %rdx
adcxq %rax, %r15
adoxq %rcx, %r13
adoxq %rdx, %r14
adoxq %rax, %r15
# Reduce
movq $0x7fffffffffffffff, %rax
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rax, %r11
# Multiply top half by 19
movq $19, %rdx
xorq %rax, %rax
mulxq %r12, %rcx, %r12
adcxq %rcx, %r8
adoxq %r12, %r9
mulxq %r13, %rcx, %r13
adcxq %rcx, %r9
adoxq %r13, %r10
mulxq %r14, %rcx, %r14
adcxq %rcx, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rax, %rdx
adcxq %rax, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rax
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rsi)
movq %r9, 8(%rsi)
movq %r10, 16(%rsi)
movq %r11, 24(%rsi)
movq 24(%rsp), %rdi
movq 120(%rsp), %rsi
movq 112(%rsp), %rbp
# Multiply
# A[0] * B[0]
movq (%rbp), %rdx
mulxq (%rsi), %r8, %r9
# A[2] * B[0]
mulxq 16(%rsi), %r10, %r11
# A[1] * B[0]
mulxq 8(%rsi), %rcx, %rax
xorq %r15, %r15
adcxq %rcx, %r9
# A[1] * B[3]
movq 24(%rbp), %rdx
mulxq 8(%rsi), %r12, %r13
adcxq %rax, %r10
# A[0] * B[1]
movq 8(%rbp), %rdx
mulxq (%rsi), %rcx, %rax
adoxq %rcx, %r9
# A[2] * B[1]
mulxq 16(%rsi), %rcx, %r14
adoxq %rax, %r10
adcxq %rcx, %r11
# A[1] * B[2]
movq 16(%rbp), %rdx
mulxq 8(%rsi), %rcx, %rax
adcxq %r14, %r12
adoxq %rcx, %r11
adcxq %r15, %r13
adoxq %rax, %r12
# A[0] * B[2]
mulxq (%rsi), %rcx, %rax
adoxq %r15, %r13
xorq %r14, %r14
adcxq %rcx, %r10
# A[1] * B[1]
movq 8(%rbp), %rdx
mulxq 8(%rsi), %rdx, %rcx
adcxq %rax, %r11
adoxq %rdx, %r10
# A[3] * B[1]
movq 8(%rbp), %rdx
adoxq %rcx, %r11
mulxq 24(%rsi), %rcx, %rax
adcxq %rcx, %r12
# A[2] * B[2]
movq 16(%rbp), %rdx
mulxq 16(%rsi), %rdx, %rcx
adcxq %rax, %r13
adoxq %rdx, %r12
# A[3] * B[3]
movq 24(%rbp), %rdx
adoxq %rcx, %r13
mulxq 24(%rsi), %rcx, %rax
adoxq %r15, %r14
adcxq %rcx, %r14
# A[0] * B[3]
mulxq (%rsi), %rdx, %rcx
adcxq %rax, %r15
xorq %rax, %rax
adcxq %rdx, %r11
# A[3] * B[0]
movq (%rbp), %rdx
adcxq %rcx, %r12
mulxq 24(%rsi), %rdx, %rcx
adoxq %rdx, %r11
adoxq %rcx, %r12
# A[2] * B[3]
movq 24(%rbp), %rdx
mulxq 16(%rsi), %rdx, %rcx
adcxq %rdx, %r13
# A[3] * B[2]
movq 16(%rbp), %rdx
adcxq %rcx, %r14
mulxq 24(%rsi), %rcx, %rdx
adcxq %rax, %r15
adoxq %rcx, %r13
adoxq %rdx, %r14
adoxq %rax, %r15
# Reduce
movq $0x7fffffffffffffff, %rax
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rax, %r11
# Multiply top half by 19
movq $19, %rdx
xorq %rax, %rax
mulxq %r12, %rcx, %r12
adcxq %rcx, %r8
adoxq %r12, %r9
mulxq %r13, %rcx, %r13
adcxq %rcx, %r9
adoxq %r13, %r10
mulxq %r14, %rcx, %r14
adcxq %rcx, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rax, %rdx
adcxq %rax, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rax
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq 8(%rsp), %rsi
movq (%rsp), %rbp
# Add
movq (%rbx), %r8
movq 8(%rbx), %r9
movq 16(%rbx), %r10
movq 24(%rbx), %rdx
movq %r8, %r12
addq (%rsi), %r8
movq %r9, %r13
adcq 8(%rsi), %r9
movq %r10, %r14
adcq 16(%rsi), %r10
movq %rdx, %r15
adcq 24(%rsi), %rdx
movq $-19, %rcx
movq %rdx, %r11
movq $0x7fffffffffffffff, %rax
sarq $63, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Sub modulus (if overflow)
subq %rcx, %r8
sbbq %rdx, %r9
sbbq %rdx, %r10
sbbq %rax, %r11
# Sub
subq (%rsi), %r12
movq $0x00, %rdx
sbbq 8(%rsi), %r13
movq $-19, %rcx
sbbq 16(%rsi), %r14
movq $0x7fffffffffffffff, %rax
sbbq 24(%rsi), %r15
sbbq $0x00, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Add modulus (if underflow)
addq %rcx, %r12
adcq %rdx, %r13
adcq %rdx, %r14
adcq %rax, %r15
movq %r8, (%rsi)
movq %r9, 8(%rsi)
movq %r10, 16(%rsi)
movq %r11, 24(%rsi)
movq %r12, (%rbp)
movq %r13, 8(%rbp)
movq %r14, 16(%rbp)
movq %r15, 24(%rbp)
movq 104(%rsp), %rsi
# Double
movq (%rsi), %r8
movq 8(%rsi), %r9
addq %r8, %r8
movq 16(%rsi), %r10
adcq %r9, %r9
movq 24(%rsi), %rdx
adcq %r10, %r10
movq $-19, %rcx
adcq %rdx, %rdx
movq $0x7fffffffffffffff, %rax
movq %rdx, %r11
sarq $63, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Sub modulus (if overflow)
subq %rcx, %r8
sbbq %rdx, %r9
sbbq %rdx, %r10
sbbq %rax, %r11
movq %r8, (%rbx)
movq %r9, 8(%rbx)
movq %r10, 16(%rbx)
movq %r11, 24(%rbx)
# Add
movq (%rbx), %r8
movq 8(%rbx), %r9
movq 16(%rbx), %r10
movq 24(%rbx), %rdx
movq %r8, %r12
addq (%rdi), %r8
movq %r9, %r13
adcq 8(%rdi), %r9
movq %r10, %r14
adcq 16(%rdi), %r10
movq %rdx, %r15
adcq 24(%rdi), %rdx
movq $-19, %rcx
movq %rdx, %r11
movq $0x7fffffffffffffff, %rax
sarq $63, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Sub modulus (if overflow)
subq %rcx, %r8
sbbq %rdx, %r9
sbbq %rdx, %r10
sbbq %rax, %r11
# Sub
subq (%rdi), %r12
movq $0x00, %rdx
sbbq 8(%rdi), %r13
movq $-19, %rcx
sbbq 16(%rdi), %r14
movq $0x7fffffffffffffff, %rax
sbbq 24(%rdi), %r15
sbbq $0x00, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Add modulus (if underflow)
addq %rcx, %r12
adcq %rdx, %r13
adcq %rdx, %r14
adcq %rax, %r15
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq %r12, (%rbx)
movq %r13, 8(%rbx)
movq %r14, 16(%rbx)
movq %r15, 24(%rbx)
addq $48, %rsp
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbx
popq %rbp
repz retq
#ifndef __APPLE__
.size fe_ge_msub_avx2,.-fe_ge_msub_avx2
#endif /* __APPLE__ */
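# fe_ge_msub_avx2 mirrors fe_ge_madd_avx2 with the precomputed y+x and
# y-x operands used in the opposite roles (note 136(%rsp)/128(%rsp)
# versus madd's 128/136), and the final sums/differences routed the
# other way.  That is equivalent to adding the negated precomputed
# point, since negating x swaps y+x and y-x.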
|
|
#ifndef __APPLE__
|
|
.text
|
|
.globl fe_ge_add_avx2
|
|
.type fe_ge_add_avx2,@function
|
|
.align 4
|
|
fe_ge_add_avx2:
|
|
#else
|
|
.section __TEXT,__text
|
|
.globl _fe_ge_add_avx2
|
|
.p2align 2
|
|
_fe_ge_add_avx2:
|
|
#endif /* __APPLE__ */
|
|
pushq %rbx
|
|
pushq %rbp
|
|
pushq %r12
|
|
pushq %r13
|
|
pushq %r14
|
|
pushq %r15
|
|
subq $0x50, %rsp
|
|
movq %rdi, (%rsp)
|
|
movq %rsi, 8(%rsp)
|
|
movq %rdx, 16(%rsp)
|
|
movq %rcx, 24(%rsp)
|
|
movq %r8, 32(%rsp)
|
|
movq %r9, 40(%rsp)
|
|
movq 8(%rsp), %rsi
|
|
movq 40(%rsp), %rbx
|
|
movq 32(%rsp), %rbp
|
|
# Add
|
|
movq (%rbx), %r8
|
|
movq 8(%rbx), %r9
|
|
movq 16(%rbx), %r10
|
|
movq 24(%rbx), %rdx
|
|
movq %r8, %r12
|
|
addq (%rbp), %r8
|
|
movq %r9, %r13
|
|
adcq 8(%rbp), %r9
|
|
movq %r10, %r14
|
|
adcq 16(%rbp), %r10
|
|
movq %rdx, %r15
|
|
adcq 24(%rbp), %rdx
|
|
movq $-19, %rcx
|
|
movq %rdx, %r11
|
|
movq $0x7fffffffffffffff, %rax
|
|
sarq $63, %rdx
|
|
# Mask the modulus
|
|
andq %rdx, %rcx
|
|
andq %rdx, %rax
|
|
# Sub modulus (if overflow)
|
|
subq %rcx, %r8
|
|
sbbq %rdx, %r9
|
|
sbbq %rdx, %r10
|
|
sbbq %rax, %r11
|
|
# Sub
|
|
subq (%rbp), %r12
|
|
movq $0x00, %rdx
|
|
sbbq 8(%rbp), %r13
|
|
movq $-19, %rcx
|
|
sbbq 16(%rbp), %r14
|
|
movq $0x7fffffffffffffff, %rax
|
|
sbbq 24(%rbp), %r15
|
|
sbbq $0x00, %rdx
|
|
# Mask the modulus
|
|
andq %rdx, %rcx
|
|
andq %rdx, %rax
|
|
# Add modulus (if underflow)
|
|
addq %rcx, %r12
|
|
adcq %rdx, %r13
|
|
adcq %rdx, %r14
|
|
adcq %rax, %r15
|
|
movq %r8, (%rdi)
|
|
movq %r9, 8(%rdi)
|
|
movq %r10, 16(%rdi)
|
|
movq %r11, 24(%rdi)
|
|
movq %r12, (%rsi)
|
|
movq %r13, 8(%rsi)
|
|
movq %r14, 16(%rsi)
|
|
movq %r15, 24(%rsi)
|
|
movq 16(%rsp), %rbx
|
|
movq 168(%rsp), %rbp
|
|
# Multiply
|
|
# A[0] * B[0]
|
|
movq (%rbp), %rdx
|
|
mulxq (%rdi), %r8, %r9
|
|
# A[2] * B[0]
|
|
mulxq 16(%rdi), %r10, %r11
|
|
# A[1] * B[0]
|
|
mulxq 8(%rdi), %rcx, %rax
|
|
xorq %r15, %r15
|
|
adcxq %rcx, %r9
|
|
# A[1] * B[3]
|
|
movq 24(%rbp), %rdx
|
|
mulxq 8(%rdi), %r12, %r13
|
|
adcxq %rax, %r10
|
|
# A[0] * B[1]
|
|
movq 8(%rbp), %rdx
|
|
mulxq (%rdi), %rcx, %rax
|
|
adoxq %rcx, %r9
|
|
# A[2] * B[1]
|
|
mulxq 16(%rdi), %rcx, %r14
|
|
adoxq %rax, %r10
|
|
adcxq %rcx, %r11
|
|
# A[1] * B[2]
|
|
movq 16(%rbp), %rdx
|
|
mulxq 8(%rdi), %rcx, %rax
|
|
adcxq %r14, %r12
|
|
adoxq %rcx, %r11
|
|
adcxq %r15, %r13
|
|
adoxq %rax, %r12
|
|
# A[0] * B[2]
|
|
mulxq (%rdi), %rcx, %rax
|
|
adoxq %r15, %r13
|
|
xorq %r14, %r14
|
|
adcxq %rcx, %r10
|
|
# A[1] * B[1]
|
|
movq 8(%rbp), %rdx
|
|
mulxq 8(%rdi), %rdx, %rcx
|
|
adcxq %rax, %r11
|
|
adoxq %rdx, %r10
|
|
# A[3] * B[1]
|
|
movq 8(%rbp), %rdx
|
|
adoxq %rcx, %r11
|
|
mulxq 24(%rdi), %rcx, %rax
|
|
adcxq %rcx, %r12
|
|
# A[2] * B[2]
|
|
movq 16(%rbp), %rdx
|
|
mulxq 16(%rdi), %rdx, %rcx
|
|
adcxq %rax, %r13
|
|
adoxq %rdx, %r12
|
|
# A[3] * B[3]
|
|
movq 24(%rbp), %rdx
|
|
adoxq %rcx, %r13
|
|
mulxq 24(%rdi), %rcx, %rax
|
|
adoxq %r15, %r14
|
|
adcxq %rcx, %r14
|
|
# A[0] * B[3]
|
|
mulxq (%rdi), %rdx, %rcx
|
|
adcxq %rax, %r15
|
|
xorq %rax, %rax
|
|
adcxq %rdx, %r11
|
|
# A[3] * B[0]
|
|
movq (%rbp), %rdx
|
|
adcxq %rcx, %r12
|
|
mulxq 24(%rdi), %rdx, %rcx
|
|
adoxq %rdx, %r11
|
|
adoxq %rcx, %r12
|
|
# A[2] * B[3]
|
|
movq 24(%rbp), %rdx
|
|
mulxq 16(%rdi), %rdx, %rcx
|
|
adcxq %rdx, %r13
|
|
# A[3] * B[2]
|
|
movq 16(%rbp), %rdx
|
|
adcxq %rcx, %r14
|
|
mulxq 24(%rdi), %rcx, %rdx
|
|
adcxq %rax, %r15
|
|
adoxq %rcx, %r13
|
|
adoxq %rdx, %r14
|
|
adoxq %rax, %r15
|
|
# Reduce
|
|
movq $0x7fffffffffffffff, %rax
|
|
# Move top half into t4-t7 and remove top bit from t3
|
|
shldq $0x01, %r14, %r15
|
|
shldq $0x01, %r13, %r14
|
|
shldq $0x01, %r12, %r13
|
|
shldq $0x01, %r11, %r12
|
|
andq %rax, %r11
|
|
# Multiply top half by 19
|
|
movq $19, %rdx
|
|
xorq %rax, %rax
|
|
mulxq %r12, %rcx, %r12
|
|
adcxq %rcx, %r8
|
|
adoxq %r12, %r9
|
|
mulxq %r13, %rcx, %r13
|
|
adcxq %rcx, %r9
|
|
adoxq %r13, %r10
|
|
mulxq %r14, %rcx, %r14
|
|
adcxq %rcx, %r10
|
|
adoxq %r14, %r11
|
|
mulxq %r15, %r15, %rdx
|
|
adcxq %r15, %r11
|
|
adoxq %rax, %rdx
|
|
adcxq %rax, %rdx
|
|
# Overflow
|
|
shldq $0x01, %r11, %rdx
|
|
movq $0x7fffffffffffffff, %rax
|
|
imulq $19, %rdx, %rcx
|
|
andq %rax, %r11
|
|
addq %rcx, %r8
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Reduce if top bit set
|
|
movq %r11, %rdx
|
|
shrq $63, %rdx
|
|
imulq $19, %rdx, %rcx
|
|
andq %rax, %r11
|
|
addq %rcx, %r8
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Store
|
|
movq %r8, (%rbx)
|
|
movq %r9, 8(%rbx)
|
|
movq %r10, 16(%rbx)
|
|
movq %r11, 24(%rbx)
|
|
movq 176(%rsp), %rbx
|
|
# Multiply
|
|
# A[0] * B[0]
|
|
movq (%rbx), %rdx
|
|
mulxq (%rsi), %r8, %r9
|
|
# A[2] * B[0]
|
|
mulxq 16(%rsi), %r10, %r11
|
|
# A[1] * B[0]
|
|
mulxq 8(%rsi), %rcx, %rax
|
|
xorq %r15, %r15
|
|
adcxq %rcx, %r9
|
|
# A[1] * B[3]
|
|
movq 24(%rbx), %rdx
|
|
mulxq 8(%rsi), %r12, %r13
|
|
adcxq %rax, %r10
|
|
# A[0] * B[1]
|
|
movq 8(%rbx), %rdx
|
|
mulxq (%rsi), %rcx, %rax
|
|
adoxq %rcx, %r9
|
|
# A[2] * B[1]
|
|
mulxq 16(%rsi), %rcx, %r14
|
|
adoxq %rax, %r10
|
|
adcxq %rcx, %r11
|
|
# A[1] * B[2]
|
|
movq 16(%rbx), %rdx
|
|
mulxq 8(%rsi), %rcx, %rax
|
|
adcxq %r14, %r12
|
|
adoxq %rcx, %r11
|
|
adcxq %r15, %r13
|
|
adoxq %rax, %r12
|
|
# A[0] * B[2]
|
|
mulxq (%rsi), %rcx, %rax
|
|
adoxq %r15, %r13
|
|
xorq %r14, %r14
|
|
adcxq %rcx, %r10
|
|
# A[1] * B[1]
|
|
movq 8(%rbx), %rdx
|
|
mulxq 8(%rsi), %rdx, %rcx
|
|
adcxq %rax, %r11
|
|
adoxq %rdx, %r10
|
|
# A[3] * B[1]
|
|
movq 8(%rbx), %rdx
|
|
adoxq %rcx, %r11
|
|
mulxq 24(%rsi), %rcx, %rax
|
|
adcxq %rcx, %r12
|
|
# A[2] * B[2]
|
|
movq 16(%rbx), %rdx
|
|
mulxq 16(%rsi), %rdx, %rcx
|
|
adcxq %rax, %r13
|
|
adoxq %rdx, %r12
|
|
# A[3] * B[3]
|
|
movq 24(%rbx), %rdx
|
|
adoxq %rcx, %r13
|
|
mulxq 24(%rsi), %rcx, %rax
|
|
adoxq %r15, %r14
|
|
adcxq %rcx, %r14
|
|
# A[0] * B[3]
|
|
mulxq (%rsi), %rdx, %rcx
|
|
adcxq %rax, %r15
|
|
xorq %rax, %rax
|
|
adcxq %rdx, %r11
|
|
# A[3] * B[0]
|
|
movq (%rbx), %rdx
|
|
adcxq %rcx, %r12
|
|
mulxq 24(%rsi), %rdx, %rcx
|
|
adoxq %rdx, %r11
|
|
adoxq %rcx, %r12
|
|
# A[2] * B[3]
|
|
movq 24(%rbx), %rdx
|
|
mulxq 16(%rsi), %rdx, %rcx
|
|
adcxq %rdx, %r13
|
|
# A[3] * B[2]
|
|
movq 16(%rbx), %rdx
|
|
adcxq %rcx, %r14
|
|
mulxq 24(%rsi), %rcx, %rdx
|
|
adcxq %rax, %r15
|
|
adoxq %rcx, %r13
|
|
adoxq %rdx, %r14
|
|
adoxq %rax, %r15
|
|
# Reduce
|
|
movq $0x7fffffffffffffff, %rax
|
|
# Move top half into t4-t7 and remove top bit from t3
|
|
shldq $0x01, %r14, %r15
|
|
shldq $0x01, %r13, %r14
|
|
shldq $0x01, %r12, %r13
|
|
shldq $0x01, %r11, %r12
|
|
andq %rax, %r11
|
|
# Multiply top half by 19
|
|
movq $19, %rdx
|
|
xorq %rax, %rax
|
|
mulxq %r12, %rcx, %r12
|
|
adcxq %rcx, %r8
|
|
adoxq %r12, %r9
|
|
mulxq %r13, %rcx, %r13
|
|
adcxq %rcx, %r9
|
|
adoxq %r13, %r10
|
|
mulxq %r14, %rcx, %r14
|
|
adcxq %rcx, %r10
|
|
adoxq %r14, %r11
|
|
mulxq %r15, %r15, %rdx
|
|
adcxq %r15, %r11
|
|
adoxq %rax, %rdx
|
|
adcxq %rax, %rdx
|
|
# Overflow
|
|
shldq $0x01, %r11, %rdx
|
|
movq $0x7fffffffffffffff, %rax
|
|
imulq $19, %rdx, %rcx
|
|
andq %rax, %r11
|
|
addq %rcx, %r8
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Reduce if top bit set
|
|
movq %r11, %rdx
|
|
shrq $63, %rdx
|
|
imulq $19, %rdx, %rcx
|
|
andq %rax, %r11
|
|
addq %rcx, %r8
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Store
|
|
movq %r8, (%rsi)
|
|
movq %r9, 8(%rsi)
|
|
movq %r10, 16(%rsi)
|
|
movq %r11, 24(%rsi)
|
|
movq 24(%rsp), %rsi
|
|
movq 160(%rsp), %rbx
|
|
movq 144(%rsp), %rbp
|
|
# Multiply
|
|
# A[0] * B[0]
|
|
movq (%rbp), %rdx
|
|
mulxq (%rbx), %r8, %r9
|
|
# A[2] * B[0]
|
|
mulxq 16(%rbx), %r10, %r11
|
|
# A[1] * B[0]
|
|
mulxq 8(%rbx), %rcx, %rax
|
|
xorq %r15, %r15
|
|
adcxq %rcx, %r9
|
|
# A[1] * B[3]
|
|
movq 24(%rbp), %rdx
|
|
mulxq 8(%rbx), %r12, %r13
|
|
adcxq %rax, %r10
|
|
# A[0] * B[1]
|
|
movq 8(%rbp), %rdx
|
|
mulxq (%rbx), %rcx, %rax
|
|
adoxq %rcx, %r9
|
|
# A[2] * B[1]
|
|
mulxq 16(%rbx), %rcx, %r14
|
|
adoxq %rax, %r10
|
|
adcxq %rcx, %r11
|
|
# A[1] * B[2]
|
|
movq 16(%rbp), %rdx
|
|
mulxq 8(%rbx), %rcx, %rax
|
|
adcxq %r14, %r12
|
|
adoxq %rcx, %r11
|
|
adcxq %r15, %r13
|
|
adoxq %rax, %r12
|
|
# A[0] * B[2]
|
|
mulxq (%rbx), %rcx, %rax
|
|
adoxq %r15, %r13
|
|
xorq %r14, %r14
|
|
adcxq %rcx, %r10
|
|
# A[1] * B[1]
|
|
movq 8(%rbp), %rdx
|
|
mulxq 8(%rbx), %rdx, %rcx
|
|
adcxq %rax, %r11
|
|
adoxq %rdx, %r10
|
|
# A[3] * B[1]
|
|
movq 8(%rbp), %rdx
|
|
adoxq %rcx, %r11
|
|
mulxq 24(%rbx), %rcx, %rax
|
|
adcxq %rcx, %r12
|
|
# A[2] * B[2]
|
|
movq 16(%rbp), %rdx
|
|
mulxq 16(%rbx), %rdx, %rcx
|
|
adcxq %rax, %r13
|
|
adoxq %rdx, %r12
|
|
# A[3] * B[3]
|
|
movq 24(%rbp), %rdx
|
|
adoxq %rcx, %r13
|
|
mulxq 24(%rbx), %rcx, %rax
|
|
adoxq %r15, %r14
|
|
adcxq %rcx, %r14
|
|
# A[0] * B[3]
|
|
mulxq (%rbx), %rdx, %rcx
|
|
adcxq %rax, %r15
|
|
xorq %rax, %rax
|
|
adcxq %rdx, %r11
|
|
# A[3] * B[0]
|
|
movq (%rbp), %rdx
|
|
adcxq %rcx, %r12
|
|
mulxq 24(%rbx), %rdx, %rcx
|
|
adoxq %rdx, %r11
|
|
adoxq %rcx, %r12
|
|
# A[2] * B[3]
|
|
movq 24(%rbp), %rdx
|
|
mulxq 16(%rbx), %rdx, %rcx
|
|
adcxq %rdx, %r13
|
|
# A[3] * B[2]
|
|
movq 16(%rbp), %rdx
|
|
adcxq %rcx, %r14
|
|
mulxq 24(%rbx), %rcx, %rdx
|
|
adcxq %rax, %r15
|
|
adoxq %rcx, %r13
|
|
adoxq %rdx, %r14
|
|
adoxq %rax, %r15
|
|
# Reduce
|
|
movq $0x7fffffffffffffff, %rax
|
|
# Move top half into t4-t7 and remove top bit from t3
|
|
shldq $0x01, %r14, %r15
|
|
shldq $0x01, %r13, %r14
|
|
shldq $0x01, %r12, %r13
|
|
shldq $0x01, %r11, %r12
|
|
andq %rax, %r11
|
|
# Multiply top half by 19
|
|
movq $19, %rdx
|
|
xorq %rax, %rax
|
|
mulxq %r12, %rcx, %r12
|
|
adcxq %rcx, %r8
|
|
adoxq %r12, %r9
|
|
mulxq %r13, %rcx, %r13
|
|
adcxq %rcx, %r9
|
|
adoxq %r13, %r10
|
|
mulxq %r14, %rcx, %r14
|
|
adcxq %rcx, %r10
|
|
adoxq %r14, %r11
|
|
mulxq %r15, %r15, %rdx
|
|
adcxq %r15, %r11
|
|
adoxq %rax, %rdx
|
|
adcxq %rax, %rdx
|
|
# Overflow
|
|
shldq $0x01, %r11, %rdx
|
|
movq $0x7fffffffffffffff, %rax
|
|
imulq $19, %rdx, %rcx
|
|
andq %rax, %r11
|
|
addq %rcx, %r8
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Reduce if top bit set
|
|
movq %r11, %rdx
|
|
shrq $63, %rdx
|
|
imulq $19, %rdx, %rcx
|
|
andq %rax, %r11
|
|
addq %rcx, %r8
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Store
|
|
movq %r8, (%rsi)
|
|
movq %r9, 8(%rsi)
|
|
movq %r10, 16(%rsi)
|
|
movq %r11, 24(%rsi)
|
|
movq 136(%rsp), %rsi
|
|
movq 152(%rsp), %rbx
|
|
# Multiply
|
|
# A[0] * B[0]
|
|
movq (%rbx), %rdx
|
|
mulxq (%rsi), %r8, %r9
|
|
# A[2] * B[0]
|
|
mulxq 16(%rsi), %r10, %r11
|
|
# A[1] * B[0]
|
|
mulxq 8(%rsi), %rcx, %rax
|
|
xorq %r15, %r15
|
|
adcxq %rcx, %r9
|
|
# A[1] * B[3]
|
|
movq 24(%rbx), %rdx
|
|
mulxq 8(%rsi), %r12, %r13
|
|
adcxq %rax, %r10
|
|
# A[0] * B[1]
|
|
movq 8(%rbx), %rdx
|
|
mulxq (%rsi), %rcx, %rax
|
|
adoxq %rcx, %r9
|
|
# A[2] * B[1]
|
|
mulxq 16(%rsi), %rcx, %r14
|
|
adoxq %rax, %r10
|
|
adcxq %rcx, %r11
|
|
# A[1] * B[2]
|
|
movq 16(%rbx), %rdx
|
|
mulxq 8(%rsi), %rcx, %rax
|
|
adcxq %r14, %r12
|
|
adoxq %rcx, %r11
|
|
adcxq %r15, %r13
|
|
adoxq %rax, %r12
|
|
# A[0] * B[2]
|
|
mulxq (%rsi), %rcx, %rax
|
|
adoxq %r15, %r13
|
|
xorq %r14, %r14
|
|
adcxq %rcx, %r10
|
|
# A[1] * B[1]
|
|
movq 8(%rbx), %rdx
|
|
mulxq 8(%rsi), %rdx, %rcx
|
|
adcxq %rax, %r11
|
|
adoxq %rdx, %r10
|
|
# A[3] * B[1]
|
|
movq 8(%rbx), %rdx
|
|
adoxq %rcx, %r11
|
|
mulxq 24(%rsi), %rcx, %rax
|
|
adcxq %rcx, %r12
|
|
# A[2] * B[2]
|
|
movq 16(%rbx), %rdx
|
|
mulxq 16(%rsi), %rdx, %rcx
|
|
adcxq %rax, %r13
|
|
adoxq %rdx, %r12
|
|
# A[3] * B[3]
|
|
movq 24(%rbx), %rdx
|
|
adoxq %rcx, %r13
|
|
mulxq 24(%rsi), %rcx, %rax
|
|
adoxq %r15, %r14
|
|
adcxq %rcx, %r14
|
|
# A[0] * B[3]
|
|
mulxq (%rsi), %rdx, %rcx
|
|
adcxq %rax, %r15
|
|
xorq %rax, %rax
|
|
adcxq %rdx, %r11
|
|
# A[3] * B[0]
|
|
movq (%rbx), %rdx
|
|
adcxq %rcx, %r12
|
|
mulxq 24(%rsi), %rdx, %rcx
|
|
adoxq %rdx, %r11
|
|
adoxq %rcx, %r12
|
|
# A[2] * B[3]
|
|
movq 24(%rbx), %rdx
|
|
mulxq 16(%rsi), %rdx, %rcx
|
|
adcxq %rdx, %r13
|
|
# A[3] * B[2]
|
|
movq 16(%rbx), %rdx
|
|
adcxq %rcx, %r14
|
|
mulxq 24(%rsi), %rcx, %rdx
|
|
adcxq %rax, %r15
|
|
adoxq %rcx, %r13
|
|
adoxq %rdx, %r14
|
|
adoxq %rax, %r15
|
|
# Reduce
|
|
movq $0x7fffffffffffffff, %rax
|
|
# Move top half into t4-t7 and remove top bit from t3
|
|
shldq $0x01, %r14, %r15
|
|
shldq $0x01, %r13, %r14
|
|
shldq $0x01, %r12, %r13
|
|
shldq $0x01, %r11, %r12
|
|
andq %rax, %r11
|
|
# Multiply top half by 19
|
|
movq $19, %rdx
|
|
xorq %rax, %rax
|
|
mulxq %r12, %rcx, %r12
|
|
adcxq %rcx, %r8
|
|
adoxq %r12, %r9
|
|
mulxq %r13, %rcx, %r13
|
|
adcxq %rcx, %r9
|
|
adoxq %r13, %r10
|
|
mulxq %r14, %rcx, %r14
|
|
adcxq %rcx, %r10
|
|
adoxq %r14, %r11
|
|
mulxq %r15, %r15, %rdx
|
|
adcxq %r15, %r11
|
|
adoxq %rax, %rdx
|
|
adcxq %rax, %rdx
|
|
# Overflow
|
|
shldq $0x01, %r11, %rdx
|
|
movq $0x7fffffffffffffff, %rax
|
|
imulq $19, %rdx, %rcx
|
|
andq %rax, %r11
|
|
addq %rcx, %r8
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Reduce if top bit set
|
|
movq %r11, %rdx
|
|
shrq $63, %rdx
|
|
imulq $19, %rdx, %rcx
|
|
andq %rax, %r11
|
|
addq %rcx, %r8
|
|
adcq $0x00, %r9
|
|
adcq $0x00, %r10
|
|
adcq $0x00, %r11
|
|
# Store
|
|
movq %r8, (%rdi)
|
|
movq %r9, 8(%rdi)
|
|
movq %r10, 16(%rdi)
|
|
movq %r11, 24(%rdi)
|
|
leaq 48(%rsp), %rsi
|
|
# Double
|
|
movq (%rdi), %r8
|
|
movq 8(%rdi), %r9
|
|
addq %r8, %r8
|
|
movq 16(%rdi), %r10
|
|
adcq %r9, %r9
|
|
movq 24(%rdi), %rdx
|
|
adcq %r10, %r10
|
|
movq $-19, %rcx
|
|
adcq %rdx, %rdx
|
|
movq $0x7fffffffffffffff, %rax
|
|
movq %rdx, %r11
|
|
sarq $63, %rdx
|
|
# Mask the modulus
|
|
andq %rdx, %rcx
|
|
andq %rdx, %rax
|
|
# Sub modulus (if overflow)
|
|
subq %rcx, %r8
|
|
sbbq %rdx, %r9
|
|
sbbq %rdx, %r10
|
|
sbbq %rax, %r11
|
|
movq %r8, (%rsi)
|
|
movq %r9, 8(%rsi)
|
|
movq %r10, 16(%rsi)
|
|
movq %r11, 24(%rsi)
|
|
movq 8(%rsp), %rbx
|
|
movq 16(%rsp), %rbp
|
|
# Add
movq (%rbp), %r8
movq 8(%rbp), %r9
movq 16(%rbp), %r10
movq 24(%rbp), %rdx
movq %r8, %r12
addq (%rbx), %r8
movq %r9, %r13
adcq 8(%rbx), %r9
movq %r10, %r14
adcq 16(%rbx), %r10
movq %rdx, %r15
adcq 24(%rbx), %rdx
movq $-19, %rcx
movq %rdx, %r11
movq $0x7fffffffffffffff, %rax
sarq $63, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Sub modulus (if overflow)
subq %rcx, %r8
sbbq %rdx, %r9
sbbq %rdx, %r10
sbbq %rax, %r11
# Sub
subq (%rbx), %r12
movq $0x00, %rdx
sbbq 8(%rbx), %r13
movq $-19, %rcx
sbbq 16(%rbx), %r14
movq $0x7fffffffffffffff, %rax
sbbq 24(%rbx), %r15
sbbq $0x00, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Add modulus (if underflow)
addq %rcx, %r12
adcq %rdx, %r13
adcq %rdx, %r14
adcq %rax, %r15
movq %r8, (%rbx)
movq %r9, 8(%rbx)
movq %r10, 16(%rbx)
movq %r11, 24(%rbx)
movq %r12, (%rdi)
movq %r13, 8(%rdi)
movq %r14, 16(%rdi)
movq %r15, 24(%rdi)
movq 24(%rsp), %rdi
# Add
movq (%rsi), %r8
movq 8(%rsi), %r9
movq 16(%rsi), %r10
movq 24(%rsi), %rdx
movq %r8, %r12
addq (%rdi), %r8
movq %r9, %r13
adcq 8(%rdi), %r9
movq %r10, %r14
adcq 16(%rdi), %r10
movq %rdx, %r15
adcq 24(%rdi), %rdx
movq $-19, %rcx
movq %rdx, %r11
movq $0x7fffffffffffffff, %rax
sarq $63, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Sub modulus (if overflow)
subq %rcx, %r8
sbbq %rdx, %r9
sbbq %rdx, %r10
sbbq %rax, %r11
# Sub
subq (%rdi), %r12
movq $0x00, %rdx
sbbq 8(%rdi), %r13
movq $-19, %rcx
sbbq 16(%rdi), %r14
movq $0x7fffffffffffffff, %rax
sbbq 24(%rdi), %r15
sbbq $0x00, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Add modulus (if underflow)
addq %rcx, %r12
adcq %rdx, %r13
adcq %rdx, %r14
adcq %rax, %r15
movq %r8, (%rbp)
movq %r9, 8(%rbp)
movq %r10, 16(%rbp)
movq %r11, 24(%rbp)
movq %r12, (%rdi)
movq %r13, 8(%rdi)
movq %r14, 16(%rdi)
movq %r15, 24(%rdi)
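# Epilogue: release the 0x50-byte frame and restore the callee-saved
# registers. The repz retq form (rep ret) is the two-byte return
# historically used to avoid a branch-prediction penalty on older AMD
# cores.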
addq $0x50, %rsp
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbp
popq %rbx
repz retq
#ifndef __APPLE__
.size fe_ge_add_avx2,.-fe_ge_add_avx2
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_ge_sub_avx2
.type fe_ge_sub_avx2,@function
.align 4
fe_ge_sub_avx2:
#else
.section __TEXT,__text
.globl _fe_ge_sub_avx2
.p2align 2
_fe_ge_sub_avx2:
#endif /* __APPLE__ */
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15
subq $0x50, %rsp
movq %rdi, (%rsp)
movq %rsi, 8(%rsp)
movq %rdx, 16(%rsp)
movq %rcx, 24(%rsp)
movq %r8, 32(%rsp)
movq %r9, 40(%rsp)
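# The six register arguments were spilled to the local frame above.
# With the 0x50-byte frame, six saved registers and the return address
# (0x50 + 48 + 8 = 136), the caller's stack arguments begin at
# 136(%rsp) and are read at 136-176(%rsp) below.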
movq 8(%rsp), %rsi
movq 40(%rsp), %rbx
movq 32(%rsp), %rbp
# Add
movq (%rbx), %r8
movq 8(%rbx), %r9
movq 16(%rbx), %r10
movq 24(%rbx), %rdx
movq %r8, %r12
addq (%rbp), %r8
movq %r9, %r13
adcq 8(%rbp), %r9
movq %r10, %r14
adcq 16(%rbp), %r10
movq %rdx, %r15
adcq 24(%rbp), %rdx
movq $-19, %rcx
movq %rdx, %r11
movq $0x7fffffffffffffff, %rax
sarq $63, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Sub modulus (if overflow)
subq %rcx, %r8
sbbq %rdx, %r9
sbbq %rdx, %r10
sbbq %rax, %r11
# Sub
subq (%rbp), %r12
movq $0x00, %rdx
sbbq 8(%rbp), %r13
movq $-19, %rcx
sbbq 16(%rbp), %r14
movq $0x7fffffffffffffff, %rax
sbbq 24(%rbp), %r15
sbbq $0x00, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Add modulus (if underflow)
addq %rcx, %r12
adcq %rdx, %r13
adcq %rdx, %r14
adcq %rax, %r15
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq %r12, (%rsi)
movq %r13, 8(%rsi)
movq %r14, 16(%rsi)
movq %r15, 24(%rsi)
movq 16(%rsp), %rbx
movq 176(%rsp), %rbp
# Multiply
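# Schoolbook 4x4-limb multiply. mulx does not modify the flags, so two
# independent carry chains are interleaved: adcx propagates through CF
# and adox through OF, letting two accumulations proceed in parallel.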
# A[0] * B[0]
movq (%rbp), %rdx
mulxq (%rdi), %r8, %r9
# A[2] * B[0]
mulxq 16(%rdi), %r10, %r11
# A[1] * B[0]
mulxq 8(%rdi), %rcx, %rax
xorq %r15, %r15
adcxq %rcx, %r9
# A[1] * B[3]
movq 24(%rbp), %rdx
mulxq 8(%rdi), %r12, %r13
adcxq %rax, %r10
# A[0] * B[1]
movq 8(%rbp), %rdx
mulxq (%rdi), %rcx, %rax
adoxq %rcx, %r9
# A[2] * B[1]
mulxq 16(%rdi), %rcx, %r14
adoxq %rax, %r10
adcxq %rcx, %r11
# A[1] * B[2]
movq 16(%rbp), %rdx
mulxq 8(%rdi), %rcx, %rax
adcxq %r14, %r12
adoxq %rcx, %r11
adcxq %r15, %r13
adoxq %rax, %r12
# A[0] * B[2]
mulxq (%rdi), %rcx, %rax
adoxq %r15, %r13
xorq %r14, %r14
adcxq %rcx, %r10
# A[1] * B[1]
movq 8(%rbp), %rdx
mulxq 8(%rdi), %rdx, %rcx
adcxq %rax, %r11
adoxq %rdx, %r10
# A[3] * B[1]
movq 8(%rbp), %rdx
adoxq %rcx, %r11
mulxq 24(%rdi), %rcx, %rax
adcxq %rcx, %r12
# A[2] * B[2]
movq 16(%rbp), %rdx
mulxq 16(%rdi), %rdx, %rcx
adcxq %rax, %r13
adoxq %rdx, %r12
# A[3] * B[3]
movq 24(%rbp), %rdx
adoxq %rcx, %r13
mulxq 24(%rdi), %rcx, %rax
adoxq %r15, %r14
adcxq %rcx, %r14
# A[0] * B[3]
mulxq (%rdi), %rdx, %rcx
adcxq %rax, %r15
xorq %rax, %rax
adcxq %rdx, %r11
# A[3] * B[0]
movq (%rbp), %rdx
adcxq %rcx, %r12
mulxq 24(%rdi), %rdx, %rcx
adoxq %rdx, %r11
adoxq %rcx, %r12
# A[2] * B[3]
movq 24(%rbp), %rdx
mulxq 16(%rdi), %rdx, %rcx
adcxq %rdx, %r13
# A[3] * B[2]
movq 16(%rbp), %rdx
adcxq %rcx, %r14
mulxq 24(%rdi), %rcx, %rdx
adcxq %rax, %r15
adoxq %rcx, %r13
adoxq %rdx, %r14
adoxq %rax, %r15
# Reduce
movq $0x7fffffffffffffff, %rax
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rax, %r11
# Multiply top half by 19
movq $19, %rdx
xorq %rax, %rax
mulxq %r12, %rcx, %r12
adcxq %rcx, %r8
adoxq %r12, %r9
mulxq %r13, %rcx, %r13
adcxq %rcx, %r9
adoxq %r13, %r10
mulxq %r14, %rcx, %r14
adcxq %rcx, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rax, %rdx
adcxq %rax, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rax
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rbx)
movq %r9, 8(%rbx)
movq %r10, 16(%rbx)
movq %r11, 24(%rbx)
movq 168(%rsp), %rbx
# Multiply
# A[0] * B[0]
movq (%rbx), %rdx
mulxq (%rsi), %r8, %r9
# A[2] * B[0]
mulxq 16(%rsi), %r10, %r11
# A[1] * B[0]
mulxq 8(%rsi), %rcx, %rax
xorq %r15, %r15
adcxq %rcx, %r9
# A[1] * B[3]
movq 24(%rbx), %rdx
mulxq 8(%rsi), %r12, %r13
adcxq %rax, %r10
# A[0] * B[1]
movq 8(%rbx), %rdx
mulxq (%rsi), %rcx, %rax
adoxq %rcx, %r9
# A[2] * B[1]
mulxq 16(%rsi), %rcx, %r14
adoxq %rax, %r10
adcxq %rcx, %r11
# A[1] * B[2]
movq 16(%rbx), %rdx
mulxq 8(%rsi), %rcx, %rax
adcxq %r14, %r12
adoxq %rcx, %r11
adcxq %r15, %r13
adoxq %rax, %r12
# A[0] * B[2]
mulxq (%rsi), %rcx, %rax
adoxq %r15, %r13
xorq %r14, %r14
adcxq %rcx, %r10
# A[1] * B[1]
movq 8(%rbx), %rdx
mulxq 8(%rsi), %rdx, %rcx
adcxq %rax, %r11
adoxq %rdx, %r10
# A[3] * B[1]
movq 8(%rbx), %rdx
adoxq %rcx, %r11
mulxq 24(%rsi), %rcx, %rax
adcxq %rcx, %r12
# A[2] * B[2]
movq 16(%rbx), %rdx
mulxq 16(%rsi), %rdx, %rcx
adcxq %rax, %r13
adoxq %rdx, %r12
# A[3] * B[3]
movq 24(%rbx), %rdx
adoxq %rcx, %r13
mulxq 24(%rsi), %rcx, %rax
adoxq %r15, %r14
adcxq %rcx, %r14
# A[0] * B[3]
mulxq (%rsi), %rdx, %rcx
adcxq %rax, %r15
xorq %rax, %rax
adcxq %rdx, %r11
# A[3] * B[0]
movq (%rbx), %rdx
adcxq %rcx, %r12
mulxq 24(%rsi), %rdx, %rcx
adoxq %rdx, %r11
adoxq %rcx, %r12
# A[2] * B[3]
movq 24(%rbx), %rdx
mulxq 16(%rsi), %rdx, %rcx
adcxq %rdx, %r13
# A[3] * B[2]
movq 16(%rbx), %rdx
adcxq %rcx, %r14
mulxq 24(%rsi), %rcx, %rdx
adcxq %rax, %r15
adoxq %rcx, %r13
adoxq %rdx, %r14
adoxq %rax, %r15
# Reduce
movq $0x7fffffffffffffff, %rax
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rax, %r11
# Multiply top half by 19
movq $19, %rdx
xorq %rax, %rax
mulxq %r12, %rcx, %r12
adcxq %rcx, %r8
adoxq %r12, %r9
mulxq %r13, %rcx, %r13
adcxq %rcx, %r9
adoxq %r13, %r10
mulxq %r14, %rcx, %r14
adcxq %rcx, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rax, %rdx
adcxq %rax, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rax
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rsi)
movq %r9, 8(%rsi)
movq %r10, 16(%rsi)
movq %r11, 24(%rsi)
movq 24(%rsp), %rsi
movq 160(%rsp), %rbx
movq 144(%rsp), %rbp
# Multiply
# A[0] * B[0]
movq (%rbp), %rdx
mulxq (%rbx), %r8, %r9
# A[2] * B[0]
mulxq 16(%rbx), %r10, %r11
# A[1] * B[0]
mulxq 8(%rbx), %rcx, %rax
xorq %r15, %r15
adcxq %rcx, %r9
# A[1] * B[3]
movq 24(%rbp), %rdx
mulxq 8(%rbx), %r12, %r13
adcxq %rax, %r10
# A[0] * B[1]
movq 8(%rbp), %rdx
mulxq (%rbx), %rcx, %rax
adoxq %rcx, %r9
# A[2] * B[1]
mulxq 16(%rbx), %rcx, %r14
adoxq %rax, %r10
adcxq %rcx, %r11
# A[1] * B[2]
movq 16(%rbp), %rdx
mulxq 8(%rbx), %rcx, %rax
adcxq %r14, %r12
adoxq %rcx, %r11
adcxq %r15, %r13
adoxq %rax, %r12
# A[0] * B[2]
mulxq (%rbx), %rcx, %rax
adoxq %r15, %r13
xorq %r14, %r14
adcxq %rcx, %r10
# A[1] * B[1]
movq 8(%rbp), %rdx
mulxq 8(%rbx), %rdx, %rcx
adcxq %rax, %r11
adoxq %rdx, %r10
# A[3] * B[1]
movq 8(%rbp), %rdx
adoxq %rcx, %r11
mulxq 24(%rbx), %rcx, %rax
adcxq %rcx, %r12
# A[2] * B[2]
movq 16(%rbp), %rdx
mulxq 16(%rbx), %rdx, %rcx
adcxq %rax, %r13
adoxq %rdx, %r12
# A[3] * B[3]
movq 24(%rbp), %rdx
adoxq %rcx, %r13
mulxq 24(%rbx), %rcx, %rax
adoxq %r15, %r14
adcxq %rcx, %r14
# A[0] * B[3]
mulxq (%rbx), %rdx, %rcx
adcxq %rax, %r15
xorq %rax, %rax
adcxq %rdx, %r11
# A[3] * B[0]
movq (%rbp), %rdx
adcxq %rcx, %r12
mulxq 24(%rbx), %rdx, %rcx
adoxq %rdx, %r11
adoxq %rcx, %r12
# A[2] * B[3]
movq 24(%rbp), %rdx
mulxq 16(%rbx), %rdx, %rcx
adcxq %rdx, %r13
# A[3] * B[2]
movq 16(%rbp), %rdx
adcxq %rcx, %r14
mulxq 24(%rbx), %rcx, %rdx
adcxq %rax, %r15
adoxq %rcx, %r13
adoxq %rdx, %r14
adoxq %rax, %r15
# Reduce
movq $0x7fffffffffffffff, %rax
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rax, %r11
# Multiply top half by 19
movq $19, %rdx
xorq %rax, %rax
mulxq %r12, %rcx, %r12
adcxq %rcx, %r8
adoxq %r12, %r9
mulxq %r13, %rcx, %r13
adcxq %rcx, %r9
adoxq %r13, %r10
mulxq %r14, %rcx, %r14
adcxq %rcx, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rax, %rdx
adcxq %rax, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rax
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rsi)
movq %r9, 8(%rsi)
movq %r10, 16(%rsi)
movq %r11, 24(%rsi)
movq 136(%rsp), %rsi
movq 152(%rsp), %rbx
# Multiply
# A[0] * B[0]
movq (%rbx), %rdx
mulxq (%rsi), %r8, %r9
# A[2] * B[0]
mulxq 16(%rsi), %r10, %r11
# A[1] * B[0]
mulxq 8(%rsi), %rcx, %rax
xorq %r15, %r15
adcxq %rcx, %r9
# A[1] * B[3]
movq 24(%rbx), %rdx
mulxq 8(%rsi), %r12, %r13
adcxq %rax, %r10
# A[0] * B[1]
movq 8(%rbx), %rdx
mulxq (%rsi), %rcx, %rax
adoxq %rcx, %r9
# A[2] * B[1]
mulxq 16(%rsi), %rcx, %r14
adoxq %rax, %r10
adcxq %rcx, %r11
# A[1] * B[2]
movq 16(%rbx), %rdx
mulxq 8(%rsi), %rcx, %rax
adcxq %r14, %r12
adoxq %rcx, %r11
adcxq %r15, %r13
adoxq %rax, %r12
# A[0] * B[2]
mulxq (%rsi), %rcx, %rax
adoxq %r15, %r13
xorq %r14, %r14
adcxq %rcx, %r10
# A[1] * B[1]
movq 8(%rbx), %rdx
mulxq 8(%rsi), %rdx, %rcx
adcxq %rax, %r11
adoxq %rdx, %r10
# A[3] * B[1]
movq 8(%rbx), %rdx
adoxq %rcx, %r11
mulxq 24(%rsi), %rcx, %rax
adcxq %rcx, %r12
# A[2] * B[2]
movq 16(%rbx), %rdx
mulxq 16(%rsi), %rdx, %rcx
adcxq %rax, %r13
adoxq %rdx, %r12
# A[3] * B[3]
movq 24(%rbx), %rdx
adoxq %rcx, %r13
mulxq 24(%rsi), %rcx, %rax
adoxq %r15, %r14
adcxq %rcx, %r14
# A[0] * B[3]
mulxq (%rsi), %rdx, %rcx
adcxq %rax, %r15
xorq %rax, %rax
adcxq %rdx, %r11
# A[3] * B[0]
movq (%rbx), %rdx
adcxq %rcx, %r12
mulxq 24(%rsi), %rdx, %rcx
adoxq %rdx, %r11
adoxq %rcx, %r12
# A[2] * B[3]
movq 24(%rbx), %rdx
mulxq 16(%rsi), %rdx, %rcx
adcxq %rdx, %r13
# A[3] * B[2]
movq 16(%rbx), %rdx
adcxq %rcx, %r14
mulxq 24(%rsi), %rcx, %rdx
adcxq %rax, %r15
adoxq %rcx, %r13
adoxq %rdx, %r14
adoxq %rax, %r15
# Reduce
movq $0x7fffffffffffffff, %rax
# Move top half into t4-t7 and remove top bit from t3
shldq $0x01, %r14, %r15
shldq $0x01, %r13, %r14
shldq $0x01, %r12, %r13
shldq $0x01, %r11, %r12
andq %rax, %r11
# Multiply top half by 19
movq $19, %rdx
xorq %rax, %rax
mulxq %r12, %rcx, %r12
adcxq %rcx, %r8
adoxq %r12, %r9
mulxq %r13, %rcx, %r13
adcxq %rcx, %r9
adoxq %r13, %r10
mulxq %r14, %rcx, %r14
adcxq %rcx, %r10
adoxq %r14, %r11
mulxq %r15, %r15, %rdx
adcxq %r15, %r11
adoxq %rax, %rdx
adcxq %rax, %rdx
# Overflow
shldq $0x01, %r11, %rdx
movq $0x7fffffffffffffff, %rax
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Reduce if top bit set
movq %r11, %rdx
shrq $63, %rdx
imulq $19, %rdx, %rcx
andq %rax, %r11
addq %rcx, %r8
adcq $0x00, %r9
adcq $0x00, %r10
adcq $0x00, %r11
# Store
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
leaq 48(%rsp), %rsi
# Double
movq (%rdi), %r8
movq 8(%rdi), %r9
addq %r8, %r8
movq 16(%rdi), %r10
adcq %r9, %r9
movq 24(%rdi), %rdx
adcq %r10, %r10
movq $-19, %rcx
adcq %rdx, %rdx
movq $0x7fffffffffffffff, %rax
movq %rdx, %r11
sarq $63, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Sub modulus (if overflow)
subq %rcx, %r8
sbbq %rdx, %r9
sbbq %rdx, %r10
sbbq %rax, %r11
movq %r8, (%rsi)
movq %r9, 8(%rsi)
movq %r10, 16(%rsi)
movq %r11, 24(%rsi)
movq 8(%rsp), %rbx
movq 16(%rsp), %rbp
# Add
movq (%rbp), %r8
movq 8(%rbp), %r9
movq 16(%rbp), %r10
movq 24(%rbp), %rdx
movq %r8, %r12
addq (%rbx), %r8
movq %r9, %r13
adcq 8(%rbx), %r9
movq %r10, %r14
adcq 16(%rbx), %r10
movq %rdx, %r15
adcq 24(%rbx), %rdx
movq $-19, %rcx
movq %rdx, %r11
movq $0x7fffffffffffffff, %rax
sarq $63, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Sub modulus (if overflow)
subq %rcx, %r8
sbbq %rdx, %r9
sbbq %rdx, %r10
sbbq %rax, %r11
# Sub
subq (%rbx), %r12
movq $0x00, %rdx
sbbq 8(%rbx), %r13
movq $-19, %rcx
sbbq 16(%rbx), %r14
movq $0x7fffffffffffffff, %rax
sbbq 24(%rbx), %r15
sbbq $0x00, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Add modulus (if underflow)
addq %rcx, %r12
adcq %rdx, %r13
adcq %rdx, %r14
adcq %rax, %r15
movq %r8, (%rbx)
movq %r9, 8(%rbx)
movq %r10, 16(%rbx)
movq %r11, 24(%rbx)
movq %r12, (%rdi)
movq %r13, 8(%rdi)
movq %r14, 16(%rdi)
movq %r15, 24(%rdi)
movq 24(%rsp), %rdi
# Add
movq (%rsi), %r8
movq 8(%rsi), %r9
movq 16(%rsi), %r10
movq 24(%rsi), %rdx
movq %r8, %r12
addq (%rdi), %r8
movq %r9, %r13
adcq 8(%rdi), %r9
movq %r10, %r14
adcq 16(%rdi), %r10
movq %rdx, %r15
adcq 24(%rdi), %rdx
movq $-19, %rcx
movq %rdx, %r11
movq $0x7fffffffffffffff, %rax
sarq $63, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Sub modulus (if overflow)
subq %rcx, %r8
sbbq %rdx, %r9
sbbq %rdx, %r10
sbbq %rax, %r11
# Sub
subq (%rdi), %r12
movq $0x00, %rdx
sbbq 8(%rdi), %r13
movq $-19, %rcx
sbbq 16(%rdi), %r14
movq $0x7fffffffffffffff, %rax
sbbq 24(%rdi), %r15
sbbq $0x00, %rdx
# Mask the modulus
andq %rdx, %rcx
andq %rdx, %rax
# Add modulus (if underflow)
addq %rcx, %r12
adcq %rdx, %r13
adcq %rdx, %r14
adcq %rax, %r15
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq %r12, (%rbp)
movq %r13, 8(%rbp)
movq %r14, 16(%rbp)
movq %r15, 24(%rbp)
addq $0x50, %rsp
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbp
popq %rbx
repz retq
#ifndef __APPLE__
.size fe_ge_sub_avx2,.-fe_ge_sub_avx2
#endif /* __APPLE__ */
#endif /* HAVE_INTEL_AVX2 */