| author | Adam Langley <agl@google.com> | 2015-01-22 14:27:53 -0800 |
|---|---|---|
| committer | Adam Langley <agl@google.com> | 2015-01-30 16:52:14 -0800 |
| commit | d9e397b599b13d642138480a28c14db7a136bf05 (patch) | |
| tree | 34bab61dc4ce323b123ad4614dbc07e86ea2f9ef /mac-x86_64/crypto/bn | |
Initial commit of BoringSSL for Android.
Diffstat (limited to 'mac-x86_64/crypto/bn')
| mode | file | insertions |
|---|---|---|
| -rw-r--r-- | mac-x86_64/crypto/bn/modexp512-x86_64.S | 1776 |
| -rw-r--r-- | mac-x86_64/crypto/bn/rsaz-avx2.S | 34 |
| -rw-r--r-- | mac-x86_64/crypto/bn/rsaz-x86_64.S | 1126 |
| -rw-r--r-- | mac-x86_64/crypto/bn/x86_64-mont.S | 726 |
| -rw-r--r-- | mac-x86_64/crypto/bn/x86_64-mont5.S | 1822 |
5 files changed, 5484 insertions, 0 deletions
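
The new files are pre-generated x86-64 assembly for the big-number Montgomery multiplication and RSA paths, exporting entry points such as `_bn_mul_mont`, `_bn_mul_mont_gather5`, the `_rsaz_512_*` routines, and `_mod_exp_512`. As orientation for reading the assembly below, here is a rough portable-C sketch of the operation a `bn_mul_mont`-style entry point conventionally computes: `rp = ap * bp * R^-1 mod np`, with `R = 2^(64*num)` and `n0 = -np^-1 mod 2^64`. The argument names follow the usual OpenSSL/BoringSSL convention; the function name and loop body are an illustrative CIOS reference only, not code from this commit, and the sketch is neither constant-time nor optimized.

```c
#include <stdint.h>
#include <stddef.h>

/*
 * Illustrative, word-by-word (CIOS) Montgomery multiplication:
 *   rp = ap * bp * R^-1 mod np,  R = 2^(64*num),  n0 = -np^-1 mod 2^64.
 * Hypothetical reference only: assumes a 64-bit compiler with
 * unsigned __int128, C99 VLAs, and inputs ap, bp already reduced mod np.
 */
static void mont_mul_ref(uint64_t *rp, const uint64_t *ap, const uint64_t *bp,
                         const uint64_t *np, uint64_t n0, size_t num) {
    uint64_t t[num + 2], r[num];
    for (size_t i = 0; i < num + 2; i++) t[i] = 0;

    for (size_t i = 0; i < num; i++) {
        /* t += ap * bp[i], propagating the carry into t[num], t[num+1]. */
        unsigned __int128 c = 0;
        for (size_t j = 0; j < num; j++) {
            unsigned __int128 v = (unsigned __int128)ap[j] * bp[i] + t[j] + c;
            t[j] = (uint64_t)v;
            c = v >> 64;
        }
        unsigned __int128 v = (unsigned __int128)t[num] + c;
        t[num] = (uint64_t)v;
        t[num + 1] = (uint64_t)(v >> 64);

        /* Pick m so that t + m*np is divisible by 2^64, then add m*np. */
        uint64_t m = t[0] * n0;
        c = 0;
        for (size_t j = 0; j < num; j++) {
            unsigned __int128 w = (unsigned __int128)m * np[j] + t[j] + c;
            t[j] = (uint64_t)w;
            c = w >> 64;
        }
        v = (unsigned __int128)t[num] + c;
        t[num] = (uint64_t)v;
        t[num + 1] += (uint64_t)(v >> 64);

        /* t[0] is now zero: divide by 2^64 by shifting the words down. */
        for (size_t j = 0; j <= num; j++) t[j] = t[j + 1];
        t[num + 1] = 0;
    }

    /* The accumulator is < 2*np; one conditional subtraction reduces it. */
    uint64_t borrow = 0;
    for (size_t j = 0; j < num; j++) {
        unsigned __int128 d = (unsigned __int128)t[j] - np[j] - borrow;
        r[j] = (uint64_t)d;
        borrow = (uint64_t)((d >> 64) != 0);
    }
    int ge = (t[num] != 0) || (borrow == 0);  /* is t >= np ? */
    for (size_t j = 0; j < num; j++) rp[j] = ge ? r[j] : t[j];
}
```

The assembly in `x86_64-mont.S` follows the same shape: an outer pass per word of `bp` (`L$outer`), multiply-accumulate inner loops (`L$1st`, `L$inner`), and a final subtract-and-masked-copy (`L$sub`, `L$copy`) instead of a branch. `x86_64-mont5.S` additionally gathers the words of `bp` from a scattered table with SSE masks (`L$magic_masks`), which appears intended to avoid leaking the table index through the cache.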
diff --git a/mac-x86_64/crypto/bn/modexp512-x86_64.S b/mac-x86_64/crypto/bn/modexp512-x86_64.S new file mode 100644 index 0000000..beb133e --- /dev/null +++ b/mac-x86_64/crypto/bn/modexp512-x86_64.S @@ -0,0 +1,1776 @@ +#if defined(__x86_64__) +.text + + +.p2align 4 +MULADD_128x512: + movq 0(%rsi),%rax + mulq %rbp + addq %rax,%r8 + adcq $0,%rdx + movq %r8,0(%rcx) + movq %rdx,%rbx + + movq 8(%rsi),%rax + mulq %rbp + addq %rax,%r9 + adcq $0,%rdx + addq %rbx,%r9 + adcq $0,%rdx + movq %rdx,%rbx + + movq 16(%rsi),%rax + mulq %rbp + addq %rax,%r10 + adcq $0,%rdx + addq %rbx,%r10 + adcq $0,%rdx + movq %rdx,%rbx + + movq 24(%rsi),%rax + mulq %rbp + addq %rax,%r11 + adcq $0,%rdx + addq %rbx,%r11 + adcq $0,%rdx + movq %rdx,%rbx + + movq 32(%rsi),%rax + mulq %rbp + addq %rax,%r12 + adcq $0,%rdx + addq %rbx,%r12 + adcq $0,%rdx + movq %rdx,%rbx + + movq 40(%rsi),%rax + mulq %rbp + addq %rax,%r13 + adcq $0,%rdx + addq %rbx,%r13 + adcq $0,%rdx + movq %rdx,%rbx + + movq 48(%rsi),%rax + mulq %rbp + addq %rax,%r14 + adcq $0,%rdx + addq %rbx,%r14 + adcq $0,%rdx + movq %rdx,%rbx + + movq 56(%rsi),%rax + mulq %rbp + addq %rax,%r15 + adcq $0,%rdx + addq %rbx,%r15 + adcq $0,%rdx + movq %rdx,%r8 + movq 8(%rdi),%rbp + movq 0(%rsi),%rax + mulq %rbp + addq %rax,%r9 + adcq $0,%rdx + movq %r9,8(%rcx) + movq %rdx,%rbx + + movq 8(%rsi),%rax + mulq %rbp + addq %rax,%r10 + adcq $0,%rdx + addq %rbx,%r10 + adcq $0,%rdx + movq %rdx,%rbx + + movq 16(%rsi),%rax + mulq %rbp + addq %rax,%r11 + adcq $0,%rdx + addq %rbx,%r11 + adcq $0,%rdx + movq %rdx,%rbx + + movq 24(%rsi),%rax + mulq %rbp + addq %rax,%r12 + adcq $0,%rdx + addq %rbx,%r12 + adcq $0,%rdx + movq %rdx,%rbx + + movq 32(%rsi),%rax + mulq %rbp + addq %rax,%r13 + adcq $0,%rdx + addq %rbx,%r13 + adcq $0,%rdx + movq %rdx,%rbx + + movq 40(%rsi),%rax + mulq %rbp + addq %rax,%r14 + adcq $0,%rdx + addq %rbx,%r14 + adcq $0,%rdx + movq %rdx,%rbx + + movq 48(%rsi),%rax + mulq %rbp + addq %rax,%r15 + adcq $0,%rdx + addq %rbx,%r15 + adcq $0,%rdx + movq %rdx,%rbx + + movq 56(%rsi),%rax + mulq %rbp + addq %rax,%r8 + adcq $0,%rdx + addq %rbx,%r8 + adcq $0,%rdx + movq %rdx,%r9 + .byte 0xf3,0xc3 + + +.p2align 4 +mont_reduce: + leaq 192(%rsp),%rdi + movq 32(%rsp),%rsi + addq $576,%rsi + leaq 520(%rsp),%rcx + + movq 96(%rcx),%rbp + movq 0(%rsi),%rax + mulq %rbp + movq (%rcx),%r8 + addq %rax,%r8 + adcq $0,%rdx + movq %r8,0(%rdi) + movq %rdx,%rbx + + movq 8(%rsi),%rax + mulq %rbp + movq 8(%rcx),%r9 + addq %rax,%r9 + adcq $0,%rdx + addq %rbx,%r9 + adcq $0,%rdx + movq %rdx,%rbx + + movq 16(%rsi),%rax + mulq %rbp + movq 16(%rcx),%r10 + addq %rax,%r10 + adcq $0,%rdx + addq %rbx,%r10 + adcq $0,%rdx + movq %rdx,%rbx + + movq 24(%rsi),%rax + mulq %rbp + movq 24(%rcx),%r11 + addq %rax,%r11 + adcq $0,%rdx + addq %rbx,%r11 + adcq $0,%rdx + movq %rdx,%rbx + + movq 32(%rsi),%rax + mulq %rbp + movq 32(%rcx),%r12 + addq %rax,%r12 + adcq $0,%rdx + addq %rbx,%r12 + adcq $0,%rdx + movq %rdx,%rbx + + movq 40(%rsi),%rax + mulq %rbp + movq 40(%rcx),%r13 + addq %rax,%r13 + adcq $0,%rdx + addq %rbx,%r13 + adcq $0,%rdx + movq %rdx,%rbx + + movq 48(%rsi),%rax + mulq %rbp + movq 48(%rcx),%r14 + addq %rax,%r14 + adcq $0,%rdx + addq %rbx,%r14 + adcq $0,%rdx + movq %rdx,%rbx + + movq 56(%rsi),%rax + mulq %rbp + movq 56(%rcx),%r15 + addq %rax,%r15 + adcq $0,%rdx + addq %rbx,%r15 + adcq $0,%rdx + movq %rdx,%r8 + movq 104(%rcx),%rbp + movq 0(%rsi),%rax + mulq %rbp + addq %rax,%r9 + adcq $0,%rdx + movq %r9,8(%rdi) + movq %rdx,%rbx + + movq 8(%rsi),%rax + mulq %rbp + addq %rax,%r10 + adcq $0,%rdx + addq %rbx,%r10 + adcq 
$0,%rdx + movq %rdx,%rbx + + movq 16(%rsi),%rax + mulq %rbp + addq %rax,%r11 + adcq $0,%rdx + addq %rbx,%r11 + adcq $0,%rdx + movq %rdx,%rbx + + movq 24(%rsi),%rax + mulq %rbp + addq %rax,%r12 + adcq $0,%rdx + addq %rbx,%r12 + adcq $0,%rdx + movq %rdx,%rbx + + movq 32(%rsi),%rax + mulq %rbp + addq %rax,%r13 + adcq $0,%rdx + addq %rbx,%r13 + adcq $0,%rdx + movq %rdx,%rbx + + movq 40(%rsi),%rax + mulq %rbp + addq %rax,%r14 + adcq $0,%rdx + addq %rbx,%r14 + adcq $0,%rdx + movq %rdx,%rbx + + movq 48(%rsi),%rax + mulq %rbp + addq %rax,%r15 + adcq $0,%rdx + addq %rbx,%r15 + adcq $0,%rdx + movq %rdx,%rbx + + movq 56(%rsi),%rax + mulq %rbp + addq %rax,%r8 + adcq $0,%rdx + addq %rbx,%r8 + adcq $0,%rdx + movq %rdx,%r9 + movq 112(%rcx),%rbp + movq 0(%rsi),%rax + mulq %rbp + addq %rax,%r10 + adcq $0,%rdx + movq %r10,16(%rdi) + movq %rdx,%rbx + + movq 8(%rsi),%rax + mulq %rbp + addq %rax,%r11 + adcq $0,%rdx + addq %rbx,%r11 + adcq $0,%rdx + movq %rdx,%rbx + + movq 16(%rsi),%rax + mulq %rbp + addq %rax,%r12 + adcq $0,%rdx + addq %rbx,%r12 + adcq $0,%rdx + movq %rdx,%rbx + + movq 24(%rsi),%rax + mulq %rbp + addq %rax,%r13 + adcq $0,%rdx + addq %rbx,%r13 + adcq $0,%rdx + movq %rdx,%rbx + + movq 32(%rsi),%rax + mulq %rbp + addq %rax,%r14 + adcq $0,%rdx + addq %rbx,%r14 + adcq $0,%rdx + movq %rdx,%rbx + + movq 40(%rsi),%rax + mulq %rbp + addq %rax,%r15 + adcq $0,%rdx + addq %rbx,%r15 + adcq $0,%rdx + movq %rdx,%rbx + + movq 48(%rsi),%rax + mulq %rbp + addq %rax,%r8 + adcq $0,%rdx + addq %rbx,%r8 + adcq $0,%rdx + movq %rdx,%rbx + + movq 56(%rsi),%rax + mulq %rbp + addq %rax,%r9 + adcq $0,%rdx + addq %rbx,%r9 + adcq $0,%rdx + movq %rdx,%r10 + movq 120(%rcx),%rbp + movq 0(%rsi),%rax + mulq %rbp + addq %rax,%r11 + adcq $0,%rdx + movq %r11,24(%rdi) + movq %rdx,%rbx + + movq 8(%rsi),%rax + mulq %rbp + addq %rax,%r12 + adcq $0,%rdx + addq %rbx,%r12 + adcq $0,%rdx + movq %rdx,%rbx + + movq 16(%rsi),%rax + mulq %rbp + addq %rax,%r13 + adcq $0,%rdx + addq %rbx,%r13 + adcq $0,%rdx + movq %rdx,%rbx + + movq 24(%rsi),%rax + mulq %rbp + addq %rax,%r14 + adcq $0,%rdx + addq %rbx,%r14 + adcq $0,%rdx + movq %rdx,%rbx + + movq 32(%rsi),%rax + mulq %rbp + addq %rax,%r15 + adcq $0,%rdx + addq %rbx,%r15 + adcq $0,%rdx + movq %rdx,%rbx + + movq 40(%rsi),%rax + mulq %rbp + addq %rax,%r8 + adcq $0,%rdx + addq %rbx,%r8 + adcq $0,%rdx + movq %rdx,%rbx + + movq 48(%rsi),%rax + mulq %rbp + addq %rax,%r9 + adcq $0,%rdx + addq %rbx,%r9 + adcq $0,%rdx + movq %rdx,%rbx + + movq 56(%rsi),%rax + mulq %rbp + addq %rax,%r10 + adcq $0,%rdx + addq %rbx,%r10 + adcq $0,%rdx + movq %rdx,%r11 + xorq %rax,%rax + + addq 64(%rcx),%r8 + adcq 72(%rcx),%r9 + adcq 80(%rcx),%r10 + adcq 88(%rcx),%r11 + adcq $0,%rax + + + + + movq %r8,64(%rdi) + movq %r9,72(%rdi) + movq %r10,%rbp + movq %r11,88(%rdi) + + movq %rax,384(%rsp) + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + + + + + + + + + addq $80,%rdi + + addq $64,%rsi + leaq 296(%rsp),%rcx + + call MULADD_128x512 + + movq 384(%rsp),%rax + + + addq -16(%rdi),%r8 + adcq -8(%rdi),%r9 + movq %r8,64(%rcx) + movq %r9,72(%rcx) + + adcq %rax,%rax + movq %rax,384(%rsp) + + leaq 192(%rsp),%rdi + addq $64,%rsi + + + + + + movq (%rsi),%r8 + movq 8(%rsi),%rbx + + movq (%rcx),%rax + mulq %r8 + movq %rax,%rbp + movq %rdx,%r9 + + movq 8(%rcx),%rax + mulq %r8 + addq %rax,%r9 + + movq (%rcx),%rax + mulq %rbx + addq %rax,%r9 + + movq %r9,8(%rdi) + + + subq $192,%rsi + + movq (%rcx),%r8 + movq 8(%rcx),%r9 + + call MULADD_128x512 + + + + + movq 0(%rsi),%rax + movq 8(%rsi),%rbx + movq 
16(%rsi),%rdi + movq 24(%rsi),%rdx + + + movq 384(%rsp),%rbp + + addq 64(%rcx),%r8 + adcq 72(%rcx),%r9 + + + adcq %rbp,%rbp + + + + shlq $3,%rbp + movq 32(%rsp),%rcx + addq %rcx,%rbp + + + xorq %rsi,%rsi + + addq 0(%rbp),%r10 + adcq 64(%rbp),%r11 + adcq 128(%rbp),%r12 + adcq 192(%rbp),%r13 + adcq 256(%rbp),%r14 + adcq 320(%rbp),%r15 + adcq 384(%rbp),%r8 + adcq 448(%rbp),%r9 + + + + sbbq $0,%rsi + + + andq %rsi,%rax + andq %rsi,%rbx + andq %rsi,%rdi + andq %rsi,%rdx + + movq $1,%rbp + subq %rax,%r10 + sbbq %rbx,%r11 + sbbq %rdi,%r12 + sbbq %rdx,%r13 + + + + + sbbq $0,%rbp + + + + addq $512,%rcx + movq 32(%rcx),%rax + movq 40(%rcx),%rbx + movq 48(%rcx),%rdi + movq 56(%rcx),%rdx + + + + andq %rsi,%rax + andq %rsi,%rbx + andq %rsi,%rdi + andq %rsi,%rdx + + + + subq $1,%rbp + + sbbq %rax,%r14 + sbbq %rbx,%r15 + sbbq %rdi,%r8 + sbbq %rdx,%r9 + + + + movq 144(%rsp),%rsi + movq %r10,0(%rsi) + movq %r11,8(%rsi) + movq %r12,16(%rsi) + movq %r13,24(%rsi) + movq %r14,32(%rsi) + movq %r15,40(%rsi) + movq %r8,48(%rsi) + movq %r9,56(%rsi) + + .byte 0xf3,0xc3 + + +.p2align 4 +mont_mul_a3b: + + + + + movq 0(%rdi),%rbp + + movq %r10,%rax + mulq %rbp + movq %rax,520(%rsp) + movq %rdx,%r10 + movq %r11,%rax + mulq %rbp + addq %rax,%r10 + adcq $0,%rdx + movq %rdx,%r11 + movq %r12,%rax + mulq %rbp + addq %rax,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %r13,%rax + mulq %rbp + addq %rax,%r12 + adcq $0,%rdx + movq %rdx,%r13 + movq %r14,%rax + mulq %rbp + addq %rax,%r13 + adcq $0,%rdx + movq %rdx,%r14 + movq %r15,%rax + mulq %rbp + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r15 + movq %r8,%rax + mulq %rbp + addq %rax,%r15 + adcq $0,%rdx + movq %rdx,%r8 + movq %r9,%rax + mulq %rbp + addq %rax,%r8 + adcq $0,%rdx + movq %rdx,%r9 + movq 8(%rdi),%rbp + movq 0(%rsi),%rax + mulq %rbp + addq %rax,%r10 + adcq $0,%rdx + movq %r10,528(%rsp) + movq %rdx,%rbx + + movq 8(%rsi),%rax + mulq %rbp + addq %rax,%r11 + adcq $0,%rdx + addq %rbx,%r11 + adcq $0,%rdx + movq %rdx,%rbx + + movq 16(%rsi),%rax + mulq %rbp + addq %rax,%r12 + adcq $0,%rdx + addq %rbx,%r12 + adcq $0,%rdx + movq %rdx,%rbx + + movq 24(%rsi),%rax + mulq %rbp + addq %rax,%r13 + adcq $0,%rdx + addq %rbx,%r13 + adcq $0,%rdx + movq %rdx,%rbx + + movq 32(%rsi),%rax + mulq %rbp + addq %rax,%r14 + adcq $0,%rdx + addq %rbx,%r14 + adcq $0,%rdx + movq %rdx,%rbx + + movq 40(%rsi),%rax + mulq %rbp + addq %rax,%r15 + adcq $0,%rdx + addq %rbx,%r15 + adcq $0,%rdx + movq %rdx,%rbx + + movq 48(%rsi),%rax + mulq %rbp + addq %rax,%r8 + adcq $0,%rdx + addq %rbx,%r8 + adcq $0,%rdx + movq %rdx,%rbx + + movq 56(%rsi),%rax + mulq %rbp + addq %rax,%r9 + adcq $0,%rdx + addq %rbx,%r9 + adcq $0,%rdx + movq %rdx,%r10 + movq 16(%rdi),%rbp + movq 0(%rsi),%rax + mulq %rbp + addq %rax,%r11 + adcq $0,%rdx + movq %r11,536(%rsp) + movq %rdx,%rbx + + movq 8(%rsi),%rax + mulq %rbp + addq %rax,%r12 + adcq $0,%rdx + addq %rbx,%r12 + adcq $0,%rdx + movq %rdx,%rbx + + movq 16(%rsi),%rax + mulq %rbp + addq %rax,%r13 + adcq $0,%rdx + addq %rbx,%r13 + adcq $0,%rdx + movq %rdx,%rbx + + movq 24(%rsi),%rax + mulq %rbp + addq %rax,%r14 + adcq $0,%rdx + addq %rbx,%r14 + adcq $0,%rdx + movq %rdx,%rbx + + movq 32(%rsi),%rax + mulq %rbp + addq %rax,%r15 + adcq $0,%rdx + addq %rbx,%r15 + adcq $0,%rdx + movq %rdx,%rbx + + movq 40(%rsi),%rax + mulq %rbp + addq %rax,%r8 + adcq $0,%rdx + addq %rbx,%r8 + adcq $0,%rdx + movq %rdx,%rbx + + movq 48(%rsi),%rax + mulq %rbp + addq %rax,%r9 + adcq $0,%rdx + addq %rbx,%r9 + adcq $0,%rdx + movq %rdx,%rbx + + movq 56(%rsi),%rax + mulq %rbp + addq %rax,%r10 + adcq $0,%rdx + addq 
%rbx,%r10 + adcq $0,%rdx + movq %rdx,%r11 + movq 24(%rdi),%rbp + movq 0(%rsi),%rax + mulq %rbp + addq %rax,%r12 + adcq $0,%rdx + movq %r12,544(%rsp) + movq %rdx,%rbx + + movq 8(%rsi),%rax + mulq %rbp + addq %rax,%r13 + adcq $0,%rdx + addq %rbx,%r13 + adcq $0,%rdx + movq %rdx,%rbx + + movq 16(%rsi),%rax + mulq %rbp + addq %rax,%r14 + adcq $0,%rdx + addq %rbx,%r14 + adcq $0,%rdx + movq %rdx,%rbx + + movq 24(%rsi),%rax + mulq %rbp + addq %rax,%r15 + adcq $0,%rdx + addq %rbx,%r15 + adcq $0,%rdx + movq %rdx,%rbx + + movq 32(%rsi),%rax + mulq %rbp + addq %rax,%r8 + adcq $0,%rdx + addq %rbx,%r8 + adcq $0,%rdx + movq %rdx,%rbx + + movq 40(%rsi),%rax + mulq %rbp + addq %rax,%r9 + adcq $0,%rdx + addq %rbx,%r9 + adcq $0,%rdx + movq %rdx,%rbx + + movq 48(%rsi),%rax + mulq %rbp + addq %rax,%r10 + adcq $0,%rdx + addq %rbx,%r10 + adcq $0,%rdx + movq %rdx,%rbx + + movq 56(%rsi),%rax + mulq %rbp + addq %rax,%r11 + adcq $0,%rdx + addq %rbx,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq 32(%rdi),%rbp + movq 0(%rsi),%rax + mulq %rbp + addq %rax,%r13 + adcq $0,%rdx + movq %r13,552(%rsp) + movq %rdx,%rbx + + movq 8(%rsi),%rax + mulq %rbp + addq %rax,%r14 + adcq $0,%rdx + addq %rbx,%r14 + adcq $0,%rdx + movq %rdx,%rbx + + movq 16(%rsi),%rax + mulq %rbp + addq %rax,%r15 + adcq $0,%rdx + addq %rbx,%r15 + adcq $0,%rdx + movq %rdx,%rbx + + movq 24(%rsi),%rax + mulq %rbp + addq %rax,%r8 + adcq $0,%rdx + addq %rbx,%r8 + adcq $0,%rdx + movq %rdx,%rbx + + movq 32(%rsi),%rax + mulq %rbp + addq %rax,%r9 + adcq $0,%rdx + addq %rbx,%r9 + adcq $0,%rdx + movq %rdx,%rbx + + movq 40(%rsi),%rax + mulq %rbp + addq %rax,%r10 + adcq $0,%rdx + addq %rbx,%r10 + adcq $0,%rdx + movq %rdx,%rbx + + movq 48(%rsi),%rax + mulq %rbp + addq %rax,%r11 + adcq $0,%rdx + addq %rbx,%r11 + adcq $0,%rdx + movq %rdx,%rbx + + movq 56(%rsi),%rax + mulq %rbp + addq %rax,%r12 + adcq $0,%rdx + addq %rbx,%r12 + adcq $0,%rdx + movq %rdx,%r13 + movq 40(%rdi),%rbp + movq 0(%rsi),%rax + mulq %rbp + addq %rax,%r14 + adcq $0,%rdx + movq %r14,560(%rsp) + movq %rdx,%rbx + + movq 8(%rsi),%rax + mulq %rbp + addq %rax,%r15 + adcq $0,%rdx + addq %rbx,%r15 + adcq $0,%rdx + movq %rdx,%rbx + + movq 16(%rsi),%rax + mulq %rbp + addq %rax,%r8 + adcq $0,%rdx + addq %rbx,%r8 + adcq $0,%rdx + movq %rdx,%rbx + + movq 24(%rsi),%rax + mulq %rbp + addq %rax,%r9 + adcq $0,%rdx + addq %rbx,%r9 + adcq $0,%rdx + movq %rdx,%rbx + + movq 32(%rsi),%rax + mulq %rbp + addq %rax,%r10 + adcq $0,%rdx + addq %rbx,%r10 + adcq $0,%rdx + movq %rdx,%rbx + + movq 40(%rsi),%rax + mulq %rbp + addq %rax,%r11 + adcq $0,%rdx + addq %rbx,%r11 + adcq $0,%rdx + movq %rdx,%rbx + + movq 48(%rsi),%rax + mulq %rbp + addq %rax,%r12 + adcq $0,%rdx + addq %rbx,%r12 + adcq $0,%rdx + movq %rdx,%rbx + + movq 56(%rsi),%rax + mulq %rbp + addq %rax,%r13 + adcq $0,%rdx + addq %rbx,%r13 + adcq $0,%rdx + movq %rdx,%r14 + movq 48(%rdi),%rbp + movq 0(%rsi),%rax + mulq %rbp + addq %rax,%r15 + adcq $0,%rdx + movq %r15,568(%rsp) + movq %rdx,%rbx + + movq 8(%rsi),%rax + mulq %rbp + addq %rax,%r8 + adcq $0,%rdx + addq %rbx,%r8 + adcq $0,%rdx + movq %rdx,%rbx + + movq 16(%rsi),%rax + mulq %rbp + addq %rax,%r9 + adcq $0,%rdx + addq %rbx,%r9 + adcq $0,%rdx + movq %rdx,%rbx + + movq 24(%rsi),%rax + mulq %rbp + addq %rax,%r10 + adcq $0,%rdx + addq %rbx,%r10 + adcq $0,%rdx + movq %rdx,%rbx + + movq 32(%rsi),%rax + mulq %rbp + addq %rax,%r11 + adcq $0,%rdx + addq %rbx,%r11 + adcq $0,%rdx + movq %rdx,%rbx + + movq 40(%rsi),%rax + mulq %rbp + addq %rax,%r12 + adcq $0,%rdx + addq %rbx,%r12 + adcq $0,%rdx + movq %rdx,%rbx + + movq 
48(%rsi),%rax + mulq %rbp + addq %rax,%r13 + adcq $0,%rdx + addq %rbx,%r13 + adcq $0,%rdx + movq %rdx,%rbx + + movq 56(%rsi),%rax + mulq %rbp + addq %rax,%r14 + adcq $0,%rdx + addq %rbx,%r14 + adcq $0,%rdx + movq %rdx,%r15 + movq 56(%rdi),%rbp + movq 0(%rsi),%rax + mulq %rbp + addq %rax,%r8 + adcq $0,%rdx + movq %r8,576(%rsp) + movq %rdx,%rbx + + movq 8(%rsi),%rax + mulq %rbp + addq %rax,%r9 + adcq $0,%rdx + addq %rbx,%r9 + adcq $0,%rdx + movq %rdx,%rbx + + movq 16(%rsi),%rax + mulq %rbp + addq %rax,%r10 + adcq $0,%rdx + addq %rbx,%r10 + adcq $0,%rdx + movq %rdx,%rbx + + movq 24(%rsi),%rax + mulq %rbp + addq %rax,%r11 + adcq $0,%rdx + addq %rbx,%r11 + adcq $0,%rdx + movq %rdx,%rbx + + movq 32(%rsi),%rax + mulq %rbp + addq %rax,%r12 + adcq $0,%rdx + addq %rbx,%r12 + adcq $0,%rdx + movq %rdx,%rbx + + movq 40(%rsi),%rax + mulq %rbp + addq %rax,%r13 + adcq $0,%rdx + addq %rbx,%r13 + adcq $0,%rdx + movq %rdx,%rbx + + movq 48(%rsi),%rax + mulq %rbp + addq %rax,%r14 + adcq $0,%rdx + addq %rbx,%r14 + adcq $0,%rdx + movq %rdx,%rbx + + movq 56(%rsi),%rax + mulq %rbp + addq %rax,%r15 + adcq $0,%rdx + addq %rbx,%r15 + adcq $0,%rdx + movq %rdx,%r8 + movq %r9,584(%rsp) + movq %r10,592(%rsp) + movq %r11,600(%rsp) + movq %r12,608(%rsp) + movq %r13,616(%rsp) + movq %r14,624(%rsp) + movq %r15,632(%rsp) + movq %r8,640(%rsp) + + + + + + jmp mont_reduce + + + + +.p2align 4 +sqr_reduce: + movq 16(%rsp),%rcx + + + + movq %r10,%rbx + + movq %r11,%rax + mulq %rbx + movq %rax,528(%rsp) + movq %rdx,%r10 + movq %r12,%rax + mulq %rbx + addq %rax,%r10 + adcq $0,%rdx + movq %rdx,%r11 + movq %r13,%rax + mulq %rbx + addq %rax,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %r14,%rax + mulq %rbx + addq %rax,%r12 + adcq $0,%rdx + movq %rdx,%r13 + movq %r15,%rax + mulq %rbx + addq %rax,%r13 + adcq $0,%rdx + movq %rdx,%r14 + movq %r8,%rax + mulq %rbx + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r15 + movq %r9,%rax + mulq %rbx + addq %rax,%r15 + adcq $0,%rdx + movq %rdx,%rsi + + movq %r10,536(%rsp) + + + + + + movq 8(%rcx),%rbx + + movq 16(%rcx),%rax + mulq %rbx + addq %rax,%r11 + adcq $0,%rdx + movq %r11,544(%rsp) + + movq %rdx,%r10 + movq 24(%rcx),%rax + mulq %rbx + addq %rax,%r12 + adcq $0,%rdx + addq %r10,%r12 + adcq $0,%rdx + movq %r12,552(%rsp) + + movq %rdx,%r10 + movq 32(%rcx),%rax + mulq %rbx + addq %rax,%r13 + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + + movq %rdx,%r10 + movq 40(%rcx),%rax + mulq %rbx + addq %rax,%r14 + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + + movq %rdx,%r10 + movq %r8,%rax + mulq %rbx + addq %rax,%r15 + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + + movq %rdx,%r10 + movq %r9,%rax + mulq %rbx + addq %rax,%rsi + adcq $0,%rdx + addq %r10,%rsi + adcq $0,%rdx + + movq %rdx,%r11 + + + + + movq 16(%rcx),%rbx + + movq 24(%rcx),%rax + mulq %rbx + addq %rax,%r13 + adcq $0,%rdx + movq %r13,560(%rsp) + + movq %rdx,%r10 + movq 32(%rcx),%rax + mulq %rbx + addq %rax,%r14 + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %r14,568(%rsp) + + movq %rdx,%r10 + movq 40(%rcx),%rax + mulq %rbx + addq %rax,%r15 + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + + movq %rdx,%r10 + movq %r8,%rax + mulq %rbx + addq %rax,%rsi + adcq $0,%rdx + addq %r10,%rsi + adcq $0,%rdx + + movq %rdx,%r10 + movq %r9,%rax + mulq %rbx + addq %rax,%r11 + adcq $0,%rdx + addq %r10,%r11 + adcq $0,%rdx + + movq %rdx,%r12 + + + + + + movq 24(%rcx),%rbx + + movq 32(%rcx),%rax + mulq %rbx + addq %rax,%r15 + adcq $0,%rdx + movq %r15,576(%rsp) + + movq %rdx,%r10 + movq 40(%rcx),%rax + mulq %rbx + addq %rax,%rsi + adcq $0,%rdx + addq 
%r10,%rsi + adcq $0,%rdx + movq %rsi,584(%rsp) + + movq %rdx,%r10 + movq %r8,%rax + mulq %rbx + addq %rax,%r11 + adcq $0,%rdx + addq %r10,%r11 + adcq $0,%rdx + + movq %rdx,%r10 + movq %r9,%rax + mulq %rbx + addq %rax,%r12 + adcq $0,%rdx + addq %r10,%r12 + adcq $0,%rdx + + movq %rdx,%r15 + + + + + movq 32(%rcx),%rbx + + movq 40(%rcx),%rax + mulq %rbx + addq %rax,%r11 + adcq $0,%rdx + movq %r11,592(%rsp) + + movq %rdx,%r10 + movq %r8,%rax + mulq %rbx + addq %rax,%r12 + adcq $0,%rdx + addq %r10,%r12 + adcq $0,%rdx + movq %r12,600(%rsp) + + movq %rdx,%r10 + movq %r9,%rax + mulq %rbx + addq %rax,%r15 + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + + movq %rdx,%r11 + + + + + movq 40(%rcx),%rbx + + movq %r8,%rax + mulq %rbx + addq %rax,%r15 + adcq $0,%rdx + movq %r15,608(%rsp) + + movq %rdx,%r10 + movq %r9,%rax + mulq %rbx + addq %rax,%r11 + adcq $0,%rdx + addq %r10,%r11 + adcq $0,%rdx + movq %r11,616(%rsp) + + movq %rdx,%r12 + + + + + movq %r8,%rbx + + movq %r9,%rax + mulq %rbx + addq %rax,%r12 + adcq $0,%rdx + movq %r12,624(%rsp) + + movq %rdx,632(%rsp) + + + movq 528(%rsp),%r10 + movq 536(%rsp),%r11 + movq 544(%rsp),%r12 + movq 552(%rsp),%r13 + movq 560(%rsp),%r14 + movq 568(%rsp),%r15 + + movq 24(%rcx),%rax + mulq %rax + movq %rax,%rdi + movq %rdx,%r8 + + addq %r10,%r10 + adcq %r11,%r11 + adcq %r12,%r12 + adcq %r13,%r13 + adcq %r14,%r14 + adcq %r15,%r15 + adcq $0,%r8 + + movq 0(%rcx),%rax + mulq %rax + movq %rax,520(%rsp) + movq %rdx,%rbx + + movq 8(%rcx),%rax + mulq %rax + + addq %rbx,%r10 + adcq %rax,%r11 + adcq $0,%rdx + + movq %rdx,%rbx + movq %r10,528(%rsp) + movq %r11,536(%rsp) + + movq 16(%rcx),%rax + mulq %rax + + addq %rbx,%r12 + adcq %rax,%r13 + adcq $0,%rdx + + movq %rdx,%rbx + + movq %r12,544(%rsp) + movq %r13,552(%rsp) + + xorq %rbp,%rbp + addq %rbx,%r14 + adcq %rdi,%r15 + adcq $0,%rbp + + movq %r14,560(%rsp) + movq %r15,568(%rsp) + + + + + movq 576(%rsp),%r10 + movq 584(%rsp),%r11 + movq 592(%rsp),%r12 + movq 600(%rsp),%r13 + movq 608(%rsp),%r14 + movq 616(%rsp),%r15 + movq 624(%rsp),%rdi + movq 632(%rsp),%rsi + + movq %r9,%rax + mulq %rax + movq %rax,%r9 + movq %rdx,%rbx + + addq %r10,%r10 + adcq %r11,%r11 + adcq %r12,%r12 + adcq %r13,%r13 + adcq %r14,%r14 + adcq %r15,%r15 + adcq %rdi,%rdi + adcq %rsi,%rsi + adcq $0,%rbx + + addq %rbp,%r10 + + movq 32(%rcx),%rax + mulq %rax + + addq %r8,%r10 + adcq %rax,%r11 + adcq $0,%rdx + + movq %rdx,%rbp + + movq %r10,576(%rsp) + movq %r11,584(%rsp) + + movq 40(%rcx),%rax + mulq %rax + + addq %rbp,%r12 + adcq %rax,%r13 + adcq $0,%rdx + + movq %rdx,%rbp + + movq %r12,592(%rsp) + movq %r13,600(%rsp) + + movq 48(%rcx),%rax + mulq %rax + + addq %rbp,%r14 + adcq %rax,%r15 + adcq $0,%rdx + + movq %r14,608(%rsp) + movq %r15,616(%rsp) + + addq %rdx,%rdi + adcq %r9,%rsi + adcq $0,%rbx + + movq %rdi,624(%rsp) + movq %rsi,632(%rsp) + movq %rbx,640(%rsp) + + jmp mont_reduce + + + +.globl _mod_exp_512 +.private_extern _mod_exp_512 + +_mod_exp_512: + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + + movq %rsp,%r8 + subq $2688,%rsp + andq $-64,%rsp + + + movq %r8,0(%rsp) + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rcx,24(%rsp) +L$body: + + + + pxor %xmm4,%xmm4 + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + movdqa %xmm4,512(%rsp) + movdqa %xmm4,528(%rsp) + movdqa %xmm4,608(%rsp) + movdqa %xmm4,624(%rsp) + movdqa %xmm0,544(%rsp) + movdqa %xmm1,560(%rsp) + movdqa %xmm2,576(%rsp) + movdqa %xmm3,592(%rsp) + + + movdqu 0(%rdx),%xmm0 + movdqu 16(%rdx),%xmm1 + movdqu 
32(%rdx),%xmm2 + movdqu 48(%rdx),%xmm3 + + leaq 384(%rsp),%rbx + movq %rbx,136(%rsp) + call mont_reduce + + + leaq 448(%rsp),%rcx + xorq %rax,%rax + movq %rax,0(%rcx) + movq %rax,8(%rcx) + movq %rax,24(%rcx) + movq %rax,32(%rcx) + movq %rax,40(%rcx) + movq %rax,48(%rcx) + movq %rax,56(%rcx) + movq %rax,128(%rsp) + movq $1,16(%rcx) + + leaq 640(%rsp),%rbp + movq %rcx,%rsi + movq %rbp,%rdi + movq $8,%rax +loop_0: + movq (%rcx),%rbx + movw %bx,(%rdi) + shrq $16,%rbx + movw %bx,64(%rdi) + shrq $16,%rbx + movw %bx,128(%rdi) + shrq $16,%rbx + movw %bx,192(%rdi) + leaq 8(%rcx),%rcx + leaq 256(%rdi),%rdi + decq %rax + jnz loop_0 + movq $31,%rax + movq %rax,32(%rsp) + movq %rbp,40(%rsp) + + movq %rsi,136(%rsp) + movq 0(%rsi),%r10 + movq 8(%rsi),%r11 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + movq 32(%rsi),%r14 + movq 40(%rsi),%r15 + movq 48(%rsi),%r8 + movq 56(%rsi),%r9 +init_loop: + leaq 384(%rsp),%rdi + call mont_mul_a3b + leaq 448(%rsp),%rsi + movq 40(%rsp),%rbp + addq $2,%rbp + movq %rbp,40(%rsp) + movq %rsi,%rcx + movq $8,%rax +loop_1: + movq (%rcx),%rbx + movw %bx,(%rbp) + shrq $16,%rbx + movw %bx,64(%rbp) + shrq $16,%rbx + movw %bx,128(%rbp) + shrq $16,%rbx + movw %bx,192(%rbp) + leaq 8(%rcx),%rcx + leaq 256(%rbp),%rbp + decq %rax + jnz loop_1 + movq 32(%rsp),%rax + subq $1,%rax + movq %rax,32(%rsp) + jne init_loop + + + + movdqa %xmm0,64(%rsp) + movdqa %xmm1,80(%rsp) + movdqa %xmm2,96(%rsp) + movdqa %xmm3,112(%rsp) + + + + + + movl 126(%rsp),%eax + movq %rax,%rdx + shrq $11,%rax + andl $2047,%edx + movl %edx,126(%rsp) + leaq 640(%rsp,%rax,2),%rsi + movq 8(%rsp),%rdx + movq $4,%rbp +loop_2: + movzwq 192(%rsi),%rbx + movzwq 448(%rsi),%rax + shlq $16,%rbx + shlq $16,%rax + movw 128(%rsi),%bx + movw 384(%rsi),%ax + shlq $16,%rbx + shlq $16,%rax + movw 64(%rsi),%bx + movw 320(%rsi),%ax + shlq $16,%rbx + shlq $16,%rax + movw 0(%rsi),%bx + movw 256(%rsi),%ax + movq %rbx,0(%rdx) + movq %rax,8(%rdx) + leaq 512(%rsi),%rsi + leaq 16(%rdx),%rdx + subq $1,%rbp + jnz loop_2 + movq $505,48(%rsp) + + movq 8(%rsp),%rcx + movq %rcx,136(%rsp) + movq 0(%rcx),%r10 + movq 8(%rcx),%r11 + movq 16(%rcx),%r12 + movq 24(%rcx),%r13 + movq 32(%rcx),%r14 + movq 40(%rcx),%r15 + movq 48(%rcx),%r8 + movq 56(%rcx),%r9 + jmp sqr_2 + +main_loop_a3b: + call sqr_reduce + call sqr_reduce + call sqr_reduce +sqr_2: + call sqr_reduce + call sqr_reduce + + + + movq 48(%rsp),%rcx + movq %rcx,%rax + shrq $4,%rax + movl 64(%rsp,%rax,2),%edx + andq $15,%rcx + shrq %cl,%rdx + andq $31,%rdx + + leaq 640(%rsp,%rdx,2),%rsi + leaq 448(%rsp),%rdx + movq %rdx,%rdi + movq $4,%rbp +loop_3: + movzwq 192(%rsi),%rbx + movzwq 448(%rsi),%rax + shlq $16,%rbx + shlq $16,%rax + movw 128(%rsi),%bx + movw 384(%rsi),%ax + shlq $16,%rbx + shlq $16,%rax + movw 64(%rsi),%bx + movw 320(%rsi),%ax + shlq $16,%rbx + shlq $16,%rax + movw 0(%rsi),%bx + movw 256(%rsi),%ax + movq %rbx,0(%rdx) + movq %rax,8(%rdx) + leaq 512(%rsi),%rsi + leaq 16(%rdx),%rdx + subq $1,%rbp + jnz loop_3 + movq 8(%rsp),%rsi + call mont_mul_a3b + + + + movq 48(%rsp),%rcx + subq $5,%rcx + movq %rcx,48(%rsp) + jge main_loop_a3b + + + +end_main_loop_a3b: + + + movq 8(%rsp),%rdx + pxor %xmm4,%xmm4 + movdqu 0(%rdx),%xmm0 + movdqu 16(%rdx),%xmm1 + movdqu 32(%rdx),%xmm2 + movdqu 48(%rdx),%xmm3 + movdqa %xmm4,576(%rsp) + movdqa %xmm4,592(%rsp) + movdqa %xmm4,608(%rsp) + movdqa %xmm4,624(%rsp) + movdqa %xmm0,512(%rsp) + movdqa %xmm1,528(%rsp) + movdqa %xmm2,544(%rsp) + movdqa %xmm3,560(%rsp) + call mont_reduce + + + + movq 8(%rsp),%rax + movq 0(%rax),%r8 + movq 8(%rax),%r9 + movq 16(%rax),%r10 + 
movq 24(%rax),%r11 + movq 32(%rax),%r12 + movq 40(%rax),%r13 + movq 48(%rax),%r14 + movq 56(%rax),%r15 + + + movq 24(%rsp),%rbx + addq $512,%rbx + + subq 0(%rbx),%r8 + sbbq 8(%rbx),%r9 + sbbq 16(%rbx),%r10 + sbbq 24(%rbx),%r11 + sbbq 32(%rbx),%r12 + sbbq 40(%rbx),%r13 + sbbq 48(%rbx),%r14 + sbbq 56(%rbx),%r15 + + + movq 0(%rax),%rsi + movq 8(%rax),%rdi + movq 16(%rax),%rcx + movq 24(%rax),%rdx + cmovncq %r8,%rsi + cmovncq %r9,%rdi + cmovncq %r10,%rcx + cmovncq %r11,%rdx + movq %rsi,0(%rax) + movq %rdi,8(%rax) + movq %rcx,16(%rax) + movq %rdx,24(%rax) + + movq 32(%rax),%rsi + movq 40(%rax),%rdi + movq 48(%rax),%rcx + movq 56(%rax),%rdx + cmovncq %r12,%rsi + cmovncq %r13,%rdi + cmovncq %r14,%rcx + cmovncq %r15,%rdx + movq %rsi,32(%rax) + movq %rdi,40(%rax) + movq %rcx,48(%rax) + movq %rdx,56(%rax) + + movq 0(%rsp),%rsi + movq 0(%rsi),%r15 + movq 8(%rsi),%r14 + movq 16(%rsi),%r13 + movq 24(%rsi),%r12 + movq 32(%rsi),%rbx + movq 40(%rsi),%rbp + leaq 48(%rsi),%rsp +L$epilogue: + .byte 0xf3,0xc3 + +#endif diff --git a/mac-x86_64/crypto/bn/rsaz-avx2.S b/mac-x86_64/crypto/bn/rsaz-avx2.S new file mode 100644 index 0000000..8ba2019 --- /dev/null +++ b/mac-x86_64/crypto/bn/rsaz-avx2.S @@ -0,0 +1,34 @@ +#if defined(__x86_64__) +.text + +.globl _rsaz_avx2_eligible +.private_extern _rsaz_avx2_eligible + +_rsaz_avx2_eligible: + xorl %eax,%eax + .byte 0xf3,0xc3 + + +.globl _rsaz_1024_sqr_avx2 +.private_extern _rsaz_1024_sqr_avx2 +.globl _rsaz_1024_mul_avx2 +.private_extern _rsaz_1024_mul_avx2 +.globl _rsaz_1024_norm2red_avx2 +.private_extern _rsaz_1024_norm2red_avx2 +.globl _rsaz_1024_red2norm_avx2 +.private_extern _rsaz_1024_red2norm_avx2 +.globl _rsaz_1024_scatter5_avx2 +.private_extern _rsaz_1024_scatter5_avx2 +.globl _rsaz_1024_gather5_avx2 +.private_extern _rsaz_1024_gather5_avx2 + +_rsaz_1024_sqr_avx2: +_rsaz_1024_mul_avx2: +_rsaz_1024_norm2red_avx2: +_rsaz_1024_red2norm_avx2: +_rsaz_1024_scatter5_avx2: +_rsaz_1024_gather5_avx2: +.byte 0x0f,0x0b + .byte 0xf3,0xc3 + +#endif diff --git a/mac-x86_64/crypto/bn/rsaz-x86_64.S b/mac-x86_64/crypto/bn/rsaz-x86_64.S new file mode 100644 index 0000000..5e9e82f --- /dev/null +++ b/mac-x86_64/crypto/bn/rsaz-x86_64.S @@ -0,0 +1,1126 @@ +#if defined(__x86_64__) +.text + + + +.globl _rsaz_512_sqr +.private_extern _rsaz_512_sqr + +.p2align 5 +_rsaz_512_sqr: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $128+24,%rsp +L$sqr_body: + movq %rdx,%rbp + movq (%rsi),%rdx + movq 8(%rsi),%rax + movq %rcx,128(%rsp) + jmp L$oop_sqr + +.p2align 5 +L$oop_sqr: + movl %r8d,128+8(%rsp) + + movq %rdx,%rbx + mulq %rdx + movq %rax,%r8 + movq 16(%rsi),%rax + movq %rdx,%r9 + + mulq %rbx + addq %rax,%r9 + movq 24(%rsi),%rax + movq %rdx,%r10 + adcq $0,%r10 + + mulq %rbx + addq %rax,%r10 + movq 32(%rsi),%rax + movq %rdx,%r11 + adcq $0,%r11 + + mulq %rbx + addq %rax,%r11 + movq 40(%rsi),%rax + movq %rdx,%r12 + adcq $0,%r12 + + mulq %rbx + addq %rax,%r12 + movq 48(%rsi),%rax + movq %rdx,%r13 + adcq $0,%r13 + + mulq %rbx + addq %rax,%r13 + movq 56(%rsi),%rax + movq %rdx,%r14 + adcq $0,%r14 + + mulq %rbx + addq %rax,%r14 + movq %rbx,%rax + movq %rdx,%r15 + adcq $0,%r15 + + addq %r8,%r8 + movq %r9,%rcx + adcq %r9,%r9 + + mulq %rax + movq %rax,(%rsp) + addq %rdx,%r8 + adcq $0,%r9 + + movq %r8,8(%rsp) + shrq $63,%rcx + + + movq 8(%rsi),%r8 + movq 16(%rsi),%rax + mulq %r8 + addq %rax,%r10 + movq 24(%rsi),%rax + movq %rdx,%rbx + adcq $0,%rbx + + mulq %r8 + addq %rax,%r11 + movq 32(%rsi),%rax + adcq $0,%rdx + addq %rbx,%r11 + movq %rdx,%rbx + adcq $0,%rbx + + 
mulq %r8 + addq %rax,%r12 + movq 40(%rsi),%rax + adcq $0,%rdx + addq %rbx,%r12 + movq %rdx,%rbx + adcq $0,%rbx + + mulq %r8 + addq %rax,%r13 + movq 48(%rsi),%rax + adcq $0,%rdx + addq %rbx,%r13 + movq %rdx,%rbx + adcq $0,%rbx + + mulq %r8 + addq %rax,%r14 + movq 56(%rsi),%rax + adcq $0,%rdx + addq %rbx,%r14 + movq %rdx,%rbx + adcq $0,%rbx + + mulq %r8 + addq %rax,%r15 + movq %r8,%rax + adcq $0,%rdx + addq %rbx,%r15 + movq %rdx,%r8 + movq %r10,%rdx + adcq $0,%r8 + + addq %rdx,%rdx + leaq (%rcx,%r10,2),%r10 + movq %r11,%rbx + adcq %r11,%r11 + + mulq %rax + addq %rax,%r9 + adcq %rdx,%r10 + adcq $0,%r11 + + movq %r9,16(%rsp) + movq %r10,24(%rsp) + shrq $63,%rbx + + + movq 16(%rsi),%r9 + movq 24(%rsi),%rax + mulq %r9 + addq %rax,%r12 + movq 32(%rsi),%rax + movq %rdx,%rcx + adcq $0,%rcx + + mulq %r9 + addq %rax,%r13 + movq 40(%rsi),%rax + adcq $0,%rdx + addq %rcx,%r13 + movq %rdx,%rcx + adcq $0,%rcx + + mulq %r9 + addq %rax,%r14 + movq 48(%rsi),%rax + adcq $0,%rdx + addq %rcx,%r14 + movq %rdx,%rcx + adcq $0,%rcx + + mulq %r9 + movq %r12,%r10 + leaq (%rbx,%r12,2),%r12 + addq %rax,%r15 + movq 56(%rsi),%rax + adcq $0,%rdx + addq %rcx,%r15 + movq %rdx,%rcx + adcq $0,%rcx + + mulq %r9 + shrq $63,%r10 + addq %rax,%r8 + movq %r9,%rax + adcq $0,%rdx + addq %rcx,%r8 + movq %rdx,%r9 + adcq $0,%r9 + + movq %r13,%rcx + leaq (%r10,%r13,2),%r13 + + mulq %rax + addq %rax,%r11 + adcq %rdx,%r12 + adcq $0,%r13 + + movq %r11,32(%rsp) + movq %r12,40(%rsp) + shrq $63,%rcx + + + movq 24(%rsi),%r10 + movq 32(%rsi),%rax + mulq %r10 + addq %rax,%r14 + movq 40(%rsi),%rax + movq %rdx,%rbx + adcq $0,%rbx + + mulq %r10 + addq %rax,%r15 + movq 48(%rsi),%rax + adcq $0,%rdx + addq %rbx,%r15 + movq %rdx,%rbx + adcq $0,%rbx + + mulq %r10 + movq %r14,%r12 + leaq (%rcx,%r14,2),%r14 + addq %rax,%r8 + movq 56(%rsi),%rax + adcq $0,%rdx + addq %rbx,%r8 + movq %rdx,%rbx + adcq $0,%rbx + + mulq %r10 + shrq $63,%r12 + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + addq %rbx,%r9 + movq %rdx,%r10 + adcq $0,%r10 + + movq %r15,%rbx + leaq (%r12,%r15,2),%r15 + + mulq %rax + addq %rax,%r13 + adcq %rdx,%r14 + adcq $0,%r15 + + movq %r13,48(%rsp) + movq %r14,56(%rsp) + shrq $63,%rbx + + + movq 32(%rsi),%r11 + movq 40(%rsi),%rax + mulq %r11 + addq %rax,%r8 + movq 48(%rsi),%rax + movq %rdx,%rcx + adcq $0,%rcx + + mulq %r11 + addq %rax,%r9 + movq 56(%rsi),%rax + adcq $0,%rdx + movq %r8,%r12 + leaq (%rbx,%r8,2),%r8 + addq %rcx,%r9 + movq %rdx,%rcx + adcq $0,%rcx + + mulq %r11 + shrq $63,%r12 + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + addq %rcx,%r10 + movq %rdx,%r11 + adcq $0,%r11 + + movq %r9,%rcx + leaq (%r12,%r9,2),%r9 + + mulq %rax + addq %rax,%r15 + adcq %rdx,%r8 + adcq $0,%r9 + + movq %r15,64(%rsp) + movq %r8,72(%rsp) + shrq $63,%rcx + + + movq 40(%rsi),%r12 + movq 48(%rsi),%rax + mulq %r12 + addq %rax,%r10 + movq 56(%rsi),%rax + movq %rdx,%rbx + adcq $0,%rbx + + mulq %r12 + addq %rax,%r11 + movq %r12,%rax + movq %r10,%r15 + leaq (%rcx,%r10,2),%r10 + adcq $0,%rdx + shrq $63,%r15 + addq %rbx,%r11 + movq %rdx,%r12 + adcq $0,%r12 + + movq %r11,%rbx + leaq (%r15,%r11,2),%r11 + + mulq %rax + addq %rax,%r9 + adcq %rdx,%r10 + adcq $0,%r11 + + movq %r9,80(%rsp) + movq %r10,88(%rsp) + + + movq 48(%rsi),%r13 + movq 56(%rsi),%rax + mulq %r13 + addq %rax,%r12 + movq %r13,%rax + movq %rdx,%r13 + adcq $0,%r13 + + xorq %r14,%r14 + shlq $1,%rbx + adcq %r12,%r12 + adcq %r13,%r13 + adcq %r14,%r14 + + mulq %rax + addq %rax,%r11 + adcq %rdx,%r12 + adcq $0,%r13 + + movq %r11,96(%rsp) + movq %r12,104(%rsp) + + + movq 56(%rsi),%rax + mulq %rax + addq 
%rax,%r13 + adcq $0,%rdx + + addq %rdx,%r14 + + movq %r13,112(%rsp) + movq %r14,120(%rsp) + + movq (%rsp),%r8 + movq 8(%rsp),%r9 + movq 16(%rsp),%r10 + movq 24(%rsp),%r11 + movq 32(%rsp),%r12 + movq 40(%rsp),%r13 + movq 48(%rsp),%r14 + movq 56(%rsp),%r15 + + call __rsaz_512_reduce + + addq 64(%rsp),%r8 + adcq 72(%rsp),%r9 + adcq 80(%rsp),%r10 + adcq 88(%rsp),%r11 + adcq 96(%rsp),%r12 + adcq 104(%rsp),%r13 + adcq 112(%rsp),%r14 + adcq 120(%rsp),%r15 + sbbq %rcx,%rcx + + call __rsaz_512_subtract + + movq %r8,%rdx + movq %r9,%rax + movl 128+8(%rsp),%r8d + movq %rdi,%rsi + + decl %r8d + jnz L$oop_sqr + + leaq 128+24+48(%rsp),%rax + movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp +L$sqr_epilogue: + .byte 0xf3,0xc3 + +.globl _rsaz_512_mul +.private_extern _rsaz_512_mul + +.p2align 5 +_rsaz_512_mul: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $128+24,%rsp +L$mul_body: +.byte 102,72,15,110,199 +.byte 102,72,15,110,201 + movq %r8,128(%rsp) + movq (%rdx),%rbx + movq %rdx,%rbp + call __rsaz_512_mul + +.byte 102,72,15,126,199 +.byte 102,72,15,126,205 + + movq (%rsp),%r8 + movq 8(%rsp),%r9 + movq 16(%rsp),%r10 + movq 24(%rsp),%r11 + movq 32(%rsp),%r12 + movq 40(%rsp),%r13 + movq 48(%rsp),%r14 + movq 56(%rsp),%r15 + + call __rsaz_512_reduce + addq 64(%rsp),%r8 + adcq 72(%rsp),%r9 + adcq 80(%rsp),%r10 + adcq 88(%rsp),%r11 + adcq 96(%rsp),%r12 + adcq 104(%rsp),%r13 + adcq 112(%rsp),%r14 + adcq 120(%rsp),%r15 + sbbq %rcx,%rcx + + call __rsaz_512_subtract + + leaq 128+24+48(%rsp),%rax + movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp +L$mul_epilogue: + .byte 0xf3,0xc3 + +.globl _rsaz_512_mul_gather4 +.private_extern _rsaz_512_mul_gather4 + +.p2align 5 +_rsaz_512_mul_gather4: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + movl %r9d,%r9d + subq $128+24,%rsp +L$mul_gather4_body: + movl 64(%rdx,%r9,4),%eax +.byte 102,72,15,110,199 + movl (%rdx,%r9,4),%ebx +.byte 102,72,15,110,201 + movq %r8,128(%rsp) + + shlq $32,%rax + orq %rax,%rbx + movq (%rsi),%rax + movq 8(%rsi),%rcx + leaq 128(%rdx,%r9,4),%rbp + mulq %rbx + movq %rax,(%rsp) + movq %rcx,%rax + movq %rdx,%r8 + + mulq %rbx + movd (%rbp),%xmm4 + addq %rax,%r8 + movq 16(%rsi),%rax + movq %rdx,%r9 + adcq $0,%r9 + + mulq %rbx + movd 64(%rbp),%xmm5 + addq %rax,%r9 + movq 24(%rsi),%rax + movq %rdx,%r10 + adcq $0,%r10 + + mulq %rbx + pslldq $4,%xmm5 + addq %rax,%r10 + movq 32(%rsi),%rax + movq %rdx,%r11 + adcq $0,%r11 + + mulq %rbx + por %xmm5,%xmm4 + addq %rax,%r11 + movq 40(%rsi),%rax + movq %rdx,%r12 + adcq $0,%r12 + + mulq %rbx + addq %rax,%r12 + movq 48(%rsi),%rax + movq %rdx,%r13 + adcq $0,%r13 + + mulq %rbx + leaq 128(%rbp),%rbp + addq %rax,%r13 + movq 56(%rsi),%rax + movq %rdx,%r14 + adcq $0,%r14 + + mulq %rbx +.byte 102,72,15,126,227 + addq %rax,%r14 + movq (%rsi),%rax + movq %rdx,%r15 + adcq $0,%r15 + + leaq 8(%rsp),%rdi + movl $7,%ecx + jmp L$oop_mul_gather + +.p2align 5 +L$oop_mul_gather: + mulq %rbx + addq %rax,%r8 + movq 8(%rsi),%rax + movq %r8,(%rdi) + movq %rdx,%r8 + adcq $0,%r8 + + mulq %rbx + movd (%rbp),%xmm4 + addq %rax,%r9 + movq 16(%rsi),%rax + adcq $0,%rdx + addq %r9,%r8 + movq %rdx,%r9 + adcq $0,%r9 + + mulq %rbx + movd 64(%rbp),%xmm5 + addq %rax,%r10 + movq 24(%rsi),%rax + adcq $0,%rdx + addq %r10,%r9 + movq %rdx,%r10 + adcq $0,%r10 + + mulq %rbx + pslldq $4,%xmm5 + 
addq %rax,%r11 + movq 32(%rsi),%rax + adcq $0,%rdx + addq %r11,%r10 + movq %rdx,%r11 + adcq $0,%r11 + + mulq %rbx + por %xmm5,%xmm4 + addq %rax,%r12 + movq 40(%rsi),%rax + adcq $0,%rdx + addq %r12,%r11 + movq %rdx,%r12 + adcq $0,%r12 + + mulq %rbx + addq %rax,%r13 + movq 48(%rsi),%rax + adcq $0,%rdx + addq %r13,%r12 + movq %rdx,%r13 + adcq $0,%r13 + + mulq %rbx + addq %rax,%r14 + movq 56(%rsi),%rax + adcq $0,%rdx + addq %r14,%r13 + movq %rdx,%r14 + adcq $0,%r14 + + mulq %rbx +.byte 102,72,15,126,227 + addq %rax,%r15 + movq (%rsi),%rax + adcq $0,%rdx + addq %r15,%r14 + movq %rdx,%r15 + adcq $0,%r15 + + leaq 128(%rbp),%rbp + leaq 8(%rdi),%rdi + + decl %ecx + jnz L$oop_mul_gather + + movq %r8,(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + +.byte 102,72,15,126,199 +.byte 102,72,15,126,205 + + movq (%rsp),%r8 + movq 8(%rsp),%r9 + movq 16(%rsp),%r10 + movq 24(%rsp),%r11 + movq 32(%rsp),%r12 + movq 40(%rsp),%r13 + movq 48(%rsp),%r14 + movq 56(%rsp),%r15 + + call __rsaz_512_reduce + addq 64(%rsp),%r8 + adcq 72(%rsp),%r9 + adcq 80(%rsp),%r10 + adcq 88(%rsp),%r11 + adcq 96(%rsp),%r12 + adcq 104(%rsp),%r13 + adcq 112(%rsp),%r14 + adcq 120(%rsp),%r15 + sbbq %rcx,%rcx + + call __rsaz_512_subtract + + leaq 128+24+48(%rsp),%rax + movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp +L$mul_gather4_epilogue: + .byte 0xf3,0xc3 + +.globl _rsaz_512_mul_scatter4 +.private_extern _rsaz_512_mul_scatter4 + +.p2align 5 +_rsaz_512_mul_scatter4: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + movl %r9d,%r9d + subq $128+24,%rsp +L$mul_scatter4_body: + leaq (%r8,%r9,4),%r8 +.byte 102,72,15,110,199 +.byte 102,72,15,110,202 +.byte 102,73,15,110,208 + movq %rcx,128(%rsp) + + movq %rdi,%rbp + movq (%rdi),%rbx + call __rsaz_512_mul + +.byte 102,72,15,126,199 +.byte 102,72,15,126,205 + + movq (%rsp),%r8 + movq 8(%rsp),%r9 + movq 16(%rsp),%r10 + movq 24(%rsp),%r11 + movq 32(%rsp),%r12 + movq 40(%rsp),%r13 + movq 48(%rsp),%r14 + movq 56(%rsp),%r15 + + call __rsaz_512_reduce + addq 64(%rsp),%r8 + adcq 72(%rsp),%r9 + adcq 80(%rsp),%r10 + adcq 88(%rsp),%r11 + adcq 96(%rsp),%r12 + adcq 104(%rsp),%r13 + adcq 112(%rsp),%r14 + adcq 120(%rsp),%r15 +.byte 102,72,15,126,214 + sbbq %rcx,%rcx + + call __rsaz_512_subtract + + movl %r8d,0(%rsi) + shrq $32,%r8 + movl %r9d,128(%rsi) + shrq $32,%r9 + movl %r10d,256(%rsi) + shrq $32,%r10 + movl %r11d,384(%rsi) + shrq $32,%r11 + movl %r12d,512(%rsi) + shrq $32,%r12 + movl %r13d,640(%rsi) + shrq $32,%r13 + movl %r14d,768(%rsi) + shrq $32,%r14 + movl %r15d,896(%rsi) + shrq $32,%r15 + movl %r8d,64(%rsi) + movl %r9d,192(%rsi) + movl %r10d,320(%rsi) + movl %r11d,448(%rsi) + movl %r12d,576(%rsi) + movl %r13d,704(%rsi) + movl %r14d,832(%rsi) + movl %r15d,960(%rsi) + + leaq 128+24+48(%rsp),%rax + movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp +L$mul_scatter4_epilogue: + .byte 0xf3,0xc3 + +.globl _rsaz_512_mul_by_one +.private_extern _rsaz_512_mul_by_one + +.p2align 5 +_rsaz_512_mul_by_one: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $128+24,%rsp +L$mul_by_one_body: + movq %rdx,%rbp + movq %rcx,128(%rsp) + + movq (%rsi),%r8 + pxor %xmm0,%xmm0 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 
40(%rsi),%r13 + movq 48(%rsi),%r14 + movq 56(%rsi),%r15 + + movdqa %xmm0,(%rsp) + movdqa %xmm0,16(%rsp) + movdqa %xmm0,32(%rsp) + movdqa %xmm0,48(%rsp) + movdqa %xmm0,64(%rsp) + movdqa %xmm0,80(%rsp) + movdqa %xmm0,96(%rsp) + call __rsaz_512_reduce + movq %r8,(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + leaq 128+24+48(%rsp),%rax + movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp +L$mul_by_one_epilogue: + .byte 0xf3,0xc3 + + +.p2align 5 +__rsaz_512_reduce: + movq %r8,%rbx + imulq 128+8(%rsp),%rbx + movq 0(%rbp),%rax + movl $8,%ecx + jmp L$reduction_loop + +.p2align 5 +L$reduction_loop: + mulq %rbx + movq 8(%rbp),%rax + negq %r8 + movq %rdx,%r8 + adcq $0,%r8 + + mulq %rbx + addq %rax,%r9 + movq 16(%rbp),%rax + adcq $0,%rdx + addq %r9,%r8 + movq %rdx,%r9 + adcq $0,%r9 + + mulq %rbx + addq %rax,%r10 + movq 24(%rbp),%rax + adcq $0,%rdx + addq %r10,%r9 + movq %rdx,%r10 + adcq $0,%r10 + + mulq %rbx + addq %rax,%r11 + movq 32(%rbp),%rax + adcq $0,%rdx + addq %r11,%r10 + movq 128+8(%rsp),%rsi + + + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbx + addq %rax,%r12 + movq 40(%rbp),%rax + adcq $0,%rdx + imulq %r8,%rsi + addq %r12,%r11 + movq %rdx,%r12 + adcq $0,%r12 + + mulq %rbx + addq %rax,%r13 + movq 48(%rbp),%rax + adcq $0,%rdx + addq %r13,%r12 + movq %rdx,%r13 + adcq $0,%r13 + + mulq %rbx + addq %rax,%r14 + movq 56(%rbp),%rax + adcq $0,%rdx + addq %r14,%r13 + movq %rdx,%r14 + adcq $0,%r14 + + mulq %rbx + movq %rsi,%rbx + addq %rax,%r15 + movq 0(%rbp),%rax + adcq $0,%rdx + addq %r15,%r14 + movq %rdx,%r15 + adcq $0,%r15 + + decl %ecx + jne L$reduction_loop + + .byte 0xf3,0xc3 + + +.p2align 5 +__rsaz_512_subtract: + movq %r8,(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + movq 0(%rbp),%r8 + movq 8(%rbp),%r9 + negq %r8 + notq %r9 + andq %rcx,%r8 + movq 16(%rbp),%r10 + andq %rcx,%r9 + notq %r10 + movq 24(%rbp),%r11 + andq %rcx,%r10 + notq %r11 + movq 32(%rbp),%r12 + andq %rcx,%r11 + notq %r12 + movq 40(%rbp),%r13 + andq %rcx,%r12 + notq %r13 + movq 48(%rbp),%r14 + andq %rcx,%r13 + notq %r14 + movq 56(%rbp),%r15 + andq %rcx,%r14 + notq %r15 + andq %rcx,%r15 + + addq (%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + + movq %r8,(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + .byte 0xf3,0xc3 + + +.p2align 5 +__rsaz_512_mul: + leaq 8(%rsp),%rdi + + movq (%rsi),%rax + mulq %rbx + movq %rax,(%rdi) + movq 8(%rsi),%rax + movq %rdx,%r8 + + mulq %rbx + addq %rax,%r8 + movq 16(%rsi),%rax + movq %rdx,%r9 + adcq $0,%r9 + + mulq %rbx + addq %rax,%r9 + movq 24(%rsi),%rax + movq %rdx,%r10 + adcq $0,%r10 + + mulq %rbx + addq %rax,%r10 + movq 32(%rsi),%rax + movq %rdx,%r11 + adcq $0,%r11 + + mulq %rbx + addq %rax,%r11 + movq 40(%rsi),%rax + movq %rdx,%r12 + adcq $0,%r12 + + mulq %rbx + addq %rax,%r12 + movq 48(%rsi),%rax + movq %rdx,%r13 + adcq $0,%r13 + + mulq %rbx + addq %rax,%r13 + movq 56(%rsi),%rax + movq %rdx,%r14 + adcq $0,%r14 + + mulq %rbx + addq %rax,%r14 + movq (%rsi),%rax + movq %rdx,%r15 + adcq $0,%r15 + + leaq 8(%rbp),%rbp + leaq 8(%rdi),%rdi + + movl $7,%ecx + jmp L$oop_mul + +.p2align 
5 +L$oop_mul: + movq (%rbp),%rbx + mulq %rbx + addq %rax,%r8 + movq 8(%rsi),%rax + movq %r8,(%rdi) + movq %rdx,%r8 + adcq $0,%r8 + + mulq %rbx + addq %rax,%r9 + movq 16(%rsi),%rax + adcq $0,%rdx + addq %r9,%r8 + movq %rdx,%r9 + adcq $0,%r9 + + mulq %rbx + addq %rax,%r10 + movq 24(%rsi),%rax + adcq $0,%rdx + addq %r10,%r9 + movq %rdx,%r10 + adcq $0,%r10 + + mulq %rbx + addq %rax,%r11 + movq 32(%rsi),%rax + adcq $0,%rdx + addq %r11,%r10 + movq %rdx,%r11 + adcq $0,%r11 + + mulq %rbx + addq %rax,%r12 + movq 40(%rsi),%rax + adcq $0,%rdx + addq %r12,%r11 + movq %rdx,%r12 + adcq $0,%r12 + + mulq %rbx + addq %rax,%r13 + movq 48(%rsi),%rax + adcq $0,%rdx + addq %r13,%r12 + movq %rdx,%r13 + adcq $0,%r13 + + mulq %rbx + addq %rax,%r14 + movq 56(%rsi),%rax + adcq $0,%rdx + addq %r14,%r13 + movq %rdx,%r14 + leaq 8(%rbp),%rbp + adcq $0,%r14 + + mulq %rbx + addq %rax,%r15 + movq (%rsi),%rax + adcq $0,%rdx + addq %r15,%r14 + movq %rdx,%r15 + adcq $0,%r15 + + leaq 8(%rdi),%rdi + + decl %ecx + jnz L$oop_mul + + movq %r8,(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + .byte 0xf3,0xc3 + +.globl _rsaz_512_scatter4 +.private_extern _rsaz_512_scatter4 + +.p2align 4 +_rsaz_512_scatter4: + leaq (%rdi,%rdx,4),%rdi + movl $8,%r9d + jmp L$oop_scatter +.p2align 4 +L$oop_scatter: + movq (%rsi),%rax + leaq 8(%rsi),%rsi + movl %eax,(%rdi) + shrq $32,%rax + movl %eax,64(%rdi) + leaq 128(%rdi),%rdi + decl %r9d + jnz L$oop_scatter + .byte 0xf3,0xc3 + + +.globl _rsaz_512_gather4 +.private_extern _rsaz_512_gather4 + +.p2align 4 +_rsaz_512_gather4: + leaq (%rsi,%rdx,4),%rsi + movl $8,%r9d + jmp L$oop_gather +.p2align 4 +L$oop_gather: + movl (%rsi),%eax + movl 64(%rsi),%r8d + leaq 128(%rsi),%rsi + shlq $32,%r8 + orq %r8,%rax + movq %rax,(%rdi) + leaq 8(%rdi),%rdi + decl %r9d + jnz L$oop_gather + .byte 0xf3,0xc3 + +#endif diff --git a/mac-x86_64/crypto/bn/x86_64-mont.S b/mac-x86_64/crypto/bn/x86_64-mont.S new file mode 100644 index 0000000..6b9bc05 --- /dev/null +++ b/mac-x86_64/crypto/bn/x86_64-mont.S @@ -0,0 +1,726 @@ +#if defined(__x86_64__) +.text + + + +.globl _bn_mul_mont +.private_extern _bn_mul_mont + +.p2align 4 +_bn_mul_mont: + testl $3,%r9d + jnz L$mul_enter + cmpl $8,%r9d + jb L$mul_enter + cmpq %rsi,%rdx + jne L$mul4x_enter + testl $7,%r9d + jz L$sqr8x_enter + jmp L$mul4x_enter + +.p2align 4 +L$mul_enter: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + movl %r9d,%r9d + leaq 2(%r9),%r10 + movq %rsp,%r11 + negq %r10 + leaq (%rsp,%r10,8),%rsp + andq $-1024,%rsp + + movq %r11,8(%rsp,%r9,8) +L$mul_body: + movq %rdx,%r12 + movq (%r8),%r8 + movq (%r12),%rbx + movq (%rsi),%rax + + xorq %r14,%r14 + xorq %r15,%r15 + + movq %r8,%rbp + mulq %rbx + movq %rax,%r10 + movq (%rcx),%rax + + imulq %r10,%rbp + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi),%rax + adcq $0,%rdx + movq %rdx,%r13 + + leaq 1(%r15),%r15 + jmp L$1st_enter + +.p2align 4 +L$1st: + addq %rax,%r13 + movq (%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%r13 + movq %r10,%r11 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + movq %rdx,%r13 + +L$1st_enter: + mulq %rbx + addq %rax,%r11 + movq (%rcx,%r15,8),%rax + adcq $0,%rdx + leaq 1(%r15),%r15 + movq %rdx,%r10 + + mulq %rbp + cmpq %r9,%r15 + jne L$1st + + addq %rax,%r13 + movq (%rsi),%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + movq %rdx,%r13 + movq %r10,%r11 + + xorq %rdx,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq 
%r13,-8(%rsp,%r9,8) + movq %rdx,(%rsp,%r9,8) + + leaq 1(%r14),%r14 + jmp L$outer +.p2align 4 +L$outer: + movq (%r12,%r14,8),%rbx + xorq %r15,%r15 + movq %r8,%rbp + movq (%rsp),%r10 + mulq %rbx + addq %rax,%r10 + movq (%rcx),%rax + adcq $0,%rdx + + imulq %r10,%rbp + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi),%rax + adcq $0,%rdx + movq 8(%rsp),%r10 + movq %rdx,%r13 + + leaq 1(%r15),%r15 + jmp L$inner_enter + +.p2align 4 +L$inner: + addq %rax,%r13 + movq (%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + movq (%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + movq %rdx,%r13 + +L$inner_enter: + mulq %rbx + addq %rax,%r11 + movq (%rcx,%r15,8),%rax + adcq $0,%rdx + addq %r11,%r10 + movq %rdx,%r11 + adcq $0,%r11 + leaq 1(%r15),%r15 + + mulq %rbp + cmpq %r9,%r15 + jne L$inner + + addq %rax,%r13 + movq (%rsi),%rax + adcq $0,%rdx + addq %r10,%r13 + movq (%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + movq %rdx,%r13 + + xorq %rdx,%rdx + addq %r11,%r13 + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-8(%rsp,%r9,8) + movq %rdx,(%rsp,%r9,8) + + leaq 1(%r14),%r14 + cmpq %r9,%r14 + jb L$outer + + xorq %r14,%r14 + movq (%rsp),%rax + leaq (%rsp),%rsi + movq %r9,%r15 + jmp L$sub +.p2align 4 +L$sub: sbbq (%rcx,%r14,8),%rax + movq %rax,(%rdi,%r14,8) + movq 8(%rsi,%r14,8),%rax + leaq 1(%r14),%r14 + decq %r15 + jnz L$sub + + sbbq $0,%rax + xorq %r14,%r14 + movq %r9,%r15 +.p2align 4 +L$copy: + movq (%rsp,%r14,8),%rsi + movq (%rdi,%r14,8),%rcx + xorq %rcx,%rsi + andq %rax,%rsi + xorq %rcx,%rsi + movq %r14,(%rsp,%r14,8) + movq %rsi,(%rdi,%r14,8) + leaq 1(%r14),%r14 + subq $1,%r15 + jnz L$copy + + movq 8(%rsp,%r9,8),%rsi + movq $1,%rax + movq (%rsi),%r15 + movq 8(%rsi),%r14 + movq 16(%rsi),%r13 + movq 24(%rsi),%r12 + movq 32(%rsi),%rbp + movq 40(%rsi),%rbx + leaq 48(%rsi),%rsp +L$mul_epilogue: + .byte 0xf3,0xc3 + + +.p2align 4 +bn_mul4x_mont: +L$mul4x_enter: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + movl %r9d,%r9d + leaq 4(%r9),%r10 + movq %rsp,%r11 + negq %r10 + leaq (%rsp,%r10,8),%rsp + andq $-1024,%rsp + + movq %r11,8(%rsp,%r9,8) +L$mul4x_body: + movq %rdi,16(%rsp,%r9,8) + movq %rdx,%r12 + movq (%r8),%r8 + movq (%r12),%rbx + movq (%rsi),%rax + + xorq %r14,%r14 + xorq %r15,%r15 + + movq %r8,%rbp + mulq %rbx + movq %rax,%r10 + movq (%rcx),%rax + + imulq %r10,%rbp + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi),%rax + adcq $0,%rdx + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 4(%r15),%r15 + adcq $0,%rdx + movq %rdi,(%rsp) + movq %rdx,%r13 + jmp L$1st4x +.p2align 4 +L$1st4x: + mulq %rbx + addq %rax,%r10 + movq -16(%rcx,%r15,8),%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx,%r15,8),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%rsp,%r15,8) + movq %rdx,%r13 + + mulq %rbx + addq %rax,%r10 + movq (%rcx,%r15,8),%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq 8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-8(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx,%r15,8),%rax + adcq $0,%rdx + 
leaq 4(%r15),%r15 + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq -16(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-32(%rsp,%r15,8) + movq %rdx,%r13 + cmpq %r9,%r15 + jb L$1st4x + + mulq %rbx + addq %rax,%r10 + movq -16(%rcx,%r15,8),%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx,%r15,8),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%rsp,%r15,8) + movq %rdx,%r13 + + xorq %rdi,%rdi + addq %r10,%r13 + adcq $0,%rdi + movq %r13,-8(%rsp,%r15,8) + movq %rdi,(%rsp,%r15,8) + + leaq 1(%r14),%r14 +.p2align 2 +L$outer4x: + movq (%r12,%r14,8),%rbx + xorq %r15,%r15 + movq (%rsp),%r10 + movq %r8,%rbp + mulq %rbx + addq %rax,%r10 + movq (%rcx),%rax + adcq $0,%rdx + + imulq %r10,%rbp + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi),%rax + adcq $0,%rdx + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx),%rax + adcq $0,%rdx + addq 8(%rsp),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 4(%r15),%r15 + adcq $0,%rdx + movq %rdi,(%rsp) + movq %rdx,%r13 + jmp L$inner4x +.p2align 4 +L$inner4x: + mulq %rbx + addq %rax,%r10 + movq -16(%rcx,%r15,8),%rax + adcq $0,%rdx + addq -16(%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx,%r15,8),%rax + adcq $0,%rdx + addq -8(%rsp,%r15,8),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%rsp,%r15,8) + movq %rdx,%r13 + + mulq %rbx + addq %rax,%r10 + movq (%rcx,%r15,8),%rax + adcq $0,%rdx + addq (%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq 8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-8(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx,%r15,8),%rax + adcq $0,%rdx + addq 8(%rsp,%r15,8),%r11 + adcq $0,%rdx + leaq 4(%r15),%r15 + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq -16(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-32(%rsp,%r15,8) + movq %rdx,%r13 + cmpq %r9,%r15 + jb L$inner4x + + mulq %rbx + addq %rax,%r10 + movq -16(%rcx,%r15,8),%rax + adcq $0,%rdx + addq -16(%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx,%r15,8),%rax + adcq $0,%rdx + addq -8(%rsp,%r15,8),%r11 + adcq $0,%rdx + leaq 1(%r14),%r14 + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%rsp,%r15,8) + movq %rdx,%r13 + + xorq %rdi,%rdi + addq %r10,%r13 + adcq $0,%rdi + addq (%rsp,%r9,8),%r13 + adcq $0,%rdi + movq %r13,-8(%rsp,%r15,8) + movq %rdi,(%rsp,%r15,8) + + cmpq %r9,%r14 + jb L$outer4x + movq 16(%rsp,%r9,8),%rdi + movq 0(%rsp),%rax + movq 8(%rsp),%rdx + shrq $2,%r9 + leaq (%rsp),%rsi + xorq %r14,%r14 + + subq 0(%rcx),%rax + movq 16(%rsi),%rbx + movq 24(%rsi),%rbp + 
sbbq 8(%rcx),%rdx + leaq -1(%r9),%r15 + jmp L$sub4x +.p2align 4 +L$sub4x: + movq %rax,0(%rdi,%r14,8) + movq %rdx,8(%rdi,%r14,8) + sbbq 16(%rcx,%r14,8),%rbx + movq 32(%rsi,%r14,8),%rax + movq 40(%rsi,%r14,8),%rdx + sbbq 24(%rcx,%r14,8),%rbp + movq %rbx,16(%rdi,%r14,8) + movq %rbp,24(%rdi,%r14,8) + sbbq 32(%rcx,%r14,8),%rax + movq 48(%rsi,%r14,8),%rbx + movq 56(%rsi,%r14,8),%rbp + sbbq 40(%rcx,%r14,8),%rdx + leaq 4(%r14),%r14 + decq %r15 + jnz L$sub4x + + movq %rax,0(%rdi,%r14,8) + movq 32(%rsi,%r14,8),%rax + sbbq 16(%rcx,%r14,8),%rbx + movq %rdx,8(%rdi,%r14,8) + sbbq 24(%rcx,%r14,8),%rbp + movq %rbx,16(%rdi,%r14,8) + + sbbq $0,%rax + movq %rax,%xmm0 + punpcklqdq %xmm0,%xmm0 + movq %rbp,24(%rdi,%r14,8) + xorq %r14,%r14 + + movq %r9,%r15 + pxor %xmm5,%xmm5 + jmp L$copy4x +.p2align 4 +L$copy4x: + movdqu (%rsp,%r14,1),%xmm2 + movdqu 16(%rsp,%r14,1),%xmm4 + movdqu (%rdi,%r14,1),%xmm1 + movdqu 16(%rdi,%r14,1),%xmm3 + pxor %xmm1,%xmm2 + pxor %xmm3,%xmm4 + pand %xmm0,%xmm2 + pand %xmm0,%xmm4 + pxor %xmm1,%xmm2 + pxor %xmm3,%xmm4 + movdqu %xmm2,(%rdi,%r14,1) + movdqu %xmm4,16(%rdi,%r14,1) + movdqa %xmm5,(%rsp,%r14,1) + movdqa %xmm5,16(%rsp,%r14,1) + + leaq 32(%r14),%r14 + decq %r15 + jnz L$copy4x + + shlq $2,%r9 + movq 8(%rsp,%r9,8),%rsi + movq $1,%rax + movq (%rsi),%r15 + movq 8(%rsi),%r14 + movq 16(%rsi),%r13 + movq 24(%rsi),%r12 + movq 32(%rsi),%rbp + movq 40(%rsi),%rbx + leaq 48(%rsi),%rsp +L$mul4x_epilogue: + .byte 0xf3,0xc3 + + + + +.p2align 5 +bn_sqr8x_mont: +L$sqr8x_enter: + movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + movl %r9d,%r10d + shll $3,%r9d + shlq $3+2,%r10 + negq %r9 + + + + + + + leaq -64(%rsp,%r9,4),%r11 + movq (%r8),%r8 + subq %rsi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb L$sqr8x_sp_alt + subq %r11,%rsp + leaq -64(%rsp,%r9,4),%rsp + jmp L$sqr8x_sp_done + +.p2align 5 +L$sqr8x_sp_alt: + leaq 4096-64(,%r9,4),%r10 + leaq -64(%rsp,%r9,4),%rsp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rsp +L$sqr8x_sp_done: + andq $-64,%rsp + movq %r9,%r10 + negq %r9 + + leaq 64(%rsp,%r9,2),%r11 + movq %r8,32(%rsp) + movq %rax,40(%rsp) +L$sqr8x_body: + + movq %r9,%rbp +.byte 102,73,15,110,211 + shrq $3+2,%rbp + movl _OPENSSL_ia32cap_P+8(%rip),%eax + jmp L$sqr8x_copy_n + +.p2align 5 +L$sqr8x_copy_n: + movq 0(%rcx),%xmm0 + movq 8(%rcx),%xmm1 + movq 16(%rcx),%xmm3 + movq 24(%rcx),%xmm4 + leaq 32(%rcx),%rcx + movdqa %xmm0,0(%r11) + movdqa %xmm1,16(%r11) + movdqa %xmm3,32(%r11) + movdqa %xmm4,48(%r11) + leaq 64(%r11),%r11 + decq %rbp + jnz L$sqr8x_copy_n + + pxor %xmm0,%xmm0 +.byte 102,72,15,110,207 +.byte 102,73,15,110,218 + call _bn_sqr8x_internal + + pxor %xmm0,%xmm0 + leaq 48(%rsp),%rax + leaq 64(%rsp,%r9,2),%rdx + shrq $3+2,%r9 + movq 40(%rsp),%rsi + jmp L$sqr8x_zero + +.p2align 5 +L$sqr8x_zero: + movdqa %xmm0,0(%rax) + movdqa %xmm0,16(%rax) + movdqa %xmm0,32(%rax) + movdqa %xmm0,48(%rax) + leaq 64(%rax),%rax + movdqa %xmm0,0(%rdx) + movdqa %xmm0,16(%rdx) + movdqa %xmm0,32(%rdx) + movdqa %xmm0,48(%rdx) + leaq 64(%rdx),%rdx + decq %r9 + jnz L$sqr8x_zero + + movq $1,%rax + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp +L$sqr8x_epilogue: + .byte 0xf3,0xc3 + +.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.p2align 4 
+#endif diff --git a/mac-x86_64/crypto/bn/x86_64-mont5.S b/mac-x86_64/crypto/bn/x86_64-mont5.S new file mode 100644 index 0000000..2e8f469 --- /dev/null +++ b/mac-x86_64/crypto/bn/x86_64-mont5.S @@ -0,0 +1,1822 @@ +#if defined(__x86_64__) +.text + + + +.globl _bn_mul_mont_gather5 +.private_extern _bn_mul_mont_gather5 + +.p2align 6 +_bn_mul_mont_gather5: + testl $7,%r9d + jnz L$mul_enter + jmp L$mul4x_enter + +.p2align 4 +L$mul_enter: + movl %r9d,%r9d + movq %rsp,%rax + movl 8(%rsp),%r10d + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + leaq 2(%r9),%r11 + negq %r11 + leaq (%rsp,%r11,8),%rsp + andq $-1024,%rsp + + movq %rax,8(%rsp,%r9,8) +L$mul_body: + movq %rdx,%r12 + movq %r10,%r11 + shrq $3,%r10 + andq $7,%r11 + notq %r10 + leaq L$magic_masks(%rip),%rax + andq $3,%r10 + leaq 96(%r12,%r11,8),%r12 + movq 0(%rax,%r10,8),%xmm4 + movq 8(%rax,%r10,8),%xmm5 + movq 16(%rax,%r10,8),%xmm6 + movq 24(%rax,%r10,8),%xmm7 + + movq -96(%r12),%xmm0 + movq -32(%r12),%xmm1 + pand %xmm4,%xmm0 + movq 32(%r12),%xmm2 + pand %xmm5,%xmm1 + movq 96(%r12),%xmm3 + pand %xmm6,%xmm2 + por %xmm1,%xmm0 + pand %xmm7,%xmm3 + por %xmm2,%xmm0 + leaq 256(%r12),%r12 + por %xmm3,%xmm0 + +.byte 102,72,15,126,195 + + movq (%r8),%r8 + movq (%rsi),%rax + + xorq %r14,%r14 + xorq %r15,%r15 + + movq -96(%r12),%xmm0 + movq -32(%r12),%xmm1 + pand %xmm4,%xmm0 + movq 32(%r12),%xmm2 + pand %xmm5,%xmm1 + + movq %r8,%rbp + mulq %rbx + movq %rax,%r10 + movq (%rcx),%rax + + movq 96(%r12),%xmm3 + pand %xmm6,%xmm2 + por %xmm1,%xmm0 + pand %xmm7,%xmm3 + + imulq %r10,%rbp + movq %rdx,%r11 + + por %xmm2,%xmm0 + leaq 256(%r12),%r12 + por %xmm3,%xmm0 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi),%rax + adcq $0,%rdx + movq %rdx,%r13 + + leaq 1(%r15),%r15 + jmp L$1st_enter + +.p2align 4 +L$1st: + addq %rax,%r13 + movq (%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%r13 + movq %r10,%r11 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + movq %rdx,%r13 + +L$1st_enter: + mulq %rbx + addq %rax,%r11 + movq (%rcx,%r15,8),%rax + adcq $0,%rdx + leaq 1(%r15),%r15 + movq %rdx,%r10 + + mulq %rbp + cmpq %r9,%r15 + jne L$1st + +.byte 102,72,15,126,195 + + addq %rax,%r13 + movq (%rsi),%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + movq %rdx,%r13 + movq %r10,%r11 + + xorq %rdx,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %r13,-8(%rsp,%r9,8) + movq %rdx,(%rsp,%r9,8) + + leaq 1(%r14),%r14 + jmp L$outer +.p2align 4 +L$outer: + xorq %r15,%r15 + movq %r8,%rbp + movq (%rsp),%r10 + + movq -96(%r12),%xmm0 + movq -32(%r12),%xmm1 + pand %xmm4,%xmm0 + movq 32(%r12),%xmm2 + pand %xmm5,%xmm1 + + mulq %rbx + addq %rax,%r10 + movq (%rcx),%rax + adcq $0,%rdx + + movq 96(%r12),%xmm3 + pand %xmm6,%xmm2 + por %xmm1,%xmm0 + pand %xmm7,%xmm3 + + imulq %r10,%rbp + movq %rdx,%r11 + + por %xmm2,%xmm0 + leaq 256(%r12),%r12 + por %xmm3,%xmm0 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi),%rax + adcq $0,%rdx + movq 8(%rsp),%r10 + movq %rdx,%r13 + + leaq 1(%r15),%r15 + jmp L$inner_enter + +.p2align 4 +L$inner: + addq %rax,%r13 + movq (%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + movq (%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + movq %rdx,%r13 + +L$inner_enter: + mulq %rbx + addq %rax,%r11 + movq (%rcx,%r15,8),%rax + adcq $0,%rdx + addq %r11,%r10 + movq %rdx,%r11 + adcq $0,%r11 + leaq 1(%r15),%r15 + + mulq %rbp + cmpq %r9,%r15 + jne L$inner + +.byte 102,72,15,126,195 + + addq %rax,%r13 + movq (%rsi),%rax + adcq $0,%rdx + addq %r10,%r13 + movq (%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + 
movq %rdx,%r13 + + xorq %rdx,%rdx + addq %r11,%r13 + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-8(%rsp,%r9,8) + movq %rdx,(%rsp,%r9,8) + + leaq 1(%r14),%r14 + cmpq %r9,%r14 + jb L$outer + + xorq %r14,%r14 + movq (%rsp),%rax + leaq (%rsp),%rsi + movq %r9,%r15 + jmp L$sub +.p2align 4 +L$sub: sbbq (%rcx,%r14,8),%rax + movq %rax,(%rdi,%r14,8) + movq 8(%rsi,%r14,8),%rax + leaq 1(%r14),%r14 + decq %r15 + jnz L$sub + + sbbq $0,%rax + xorq %r14,%r14 + movq %r9,%r15 +.p2align 4 +L$copy: + movq (%rsp,%r14,8),%rsi + movq (%rdi,%r14,8),%rcx + xorq %rcx,%rsi + andq %rax,%rsi + xorq %rcx,%rsi + movq %r14,(%rsp,%r14,8) + movq %rsi,(%rdi,%r14,8) + leaq 1(%r14),%r14 + subq $1,%r15 + jnz L$copy + + movq 8(%rsp,%r9,8),%rsi + movq $1,%rax + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp +L$mul_epilogue: + .byte 0xf3,0xc3 + + +.p2align 5 +bn_mul4x_mont_gather5: +L$mul4x_enter: +.byte 0x67 + movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 +.byte 0x67 + movl %r9d,%r10d + shll $3,%r9d + shll $3+2,%r10d + negq %r9 + + + + + + + + + leaq -64(%rsp,%r9,2),%r11 + subq %rsi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb L$mul4xsp_alt + subq %r11,%rsp + leaq -64(%rsp,%r9,2),%rsp + jmp L$mul4xsp_done + +.p2align 5 +L$mul4xsp_alt: + leaq 4096-64(,%r9,2),%r10 + leaq -64(%rsp,%r9,2),%rsp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rsp +L$mul4xsp_done: + andq $-64,%rsp + negq %r9 + + movq %rax,40(%rsp) +L$mul4x_body: + + call mul4x_internal + + movq 40(%rsp),%rsi + movq $1,%rax + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp +L$mul4x_epilogue: + .byte 0xf3,0xc3 + + + +.p2align 5 +mul4x_internal: + shlq $5,%r9 + movl 8(%rax),%r10d + leaq 256(%rdx,%r9,1),%r13 + shrq $5,%r9 + movq %r10,%r11 + shrq $3,%r10 + andq $7,%r11 + notq %r10 + leaq L$magic_masks(%rip),%rax + andq $3,%r10 + leaq 96(%rdx,%r11,8),%r12 + movq 0(%rax,%r10,8),%xmm4 + movq 8(%rax,%r10,8),%xmm5 + addq $7,%r11 + movq 16(%rax,%r10,8),%xmm6 + movq 24(%rax,%r10,8),%xmm7 + andq $7,%r11 + + movq -96(%r12),%xmm0 + leaq 256(%r12),%r14 + movq -32(%r12),%xmm1 + pand %xmm4,%xmm0 + movq 32(%r12),%xmm2 + pand %xmm5,%xmm1 + movq 96(%r12),%xmm3 + pand %xmm6,%xmm2 +.byte 0x67 + por %xmm1,%xmm0 + movq -96(%r14),%xmm1 +.byte 0x67 + pand %xmm7,%xmm3 +.byte 0x67 + por %xmm2,%xmm0 + movq -32(%r14),%xmm2 +.byte 0x67 + pand %xmm4,%xmm1 +.byte 0x67 + por %xmm3,%xmm0 + movq 32(%r14),%xmm3 + +.byte 102,72,15,126,195 + movq 96(%r14),%xmm0 + movq %r13,16+8(%rsp) + movq %rdi,56+8(%rsp) + + movq (%r8),%r8 + movq (%rsi),%rax + leaq (%rsi,%r9,1),%rsi + negq %r9 + + movq %r8,%rbp + mulq %rbx + movq %rax,%r10 + movq (%rcx),%rax + + pand %xmm5,%xmm2 + pand %xmm6,%xmm3 + por %xmm2,%xmm1 + + imulq %r10,%rbp + + + + + + + + leaq 64+8(%rsp,%r11,8),%r14 + movq %rdx,%r11 + + pand %xmm7,%xmm0 + por %xmm3,%xmm1 + leaq 512(%r12),%r12 + por %xmm1,%xmm0 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi,%r9,1),%rax + adcq $0,%rdx + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 16(%rcx),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi,%r9,1),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 32(%r9),%r15 + leaq 64(%rcx),%rcx + adcq $0,%rdx + movq %rdi,(%r14) + movq %rdx,%r13 + jmp L$1st4x + +.p2align 5 +L$1st4x: + mulq %rbx + addq %rax,%r10 + movq -32(%rcx),%rax + leaq 32(%r14),%r14 + adcq $0,%rdx + movq 
%rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%r14) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -16(%rcx),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%r14) + movq %rdx,%r13 + + mulq %rbx + addq %rax,%r10 + movq 0(%rcx),%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq 8(%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-8(%r14) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 16(%rcx),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 64(%rcx),%rcx + adcq $0,%rdx + movq %rdi,(%r14) + movq %rdx,%r13 + + addq $32,%r15 + jnz L$1st4x + + mulq %rbx + addq %rax,%r10 + movq -32(%rcx),%rax + leaq 32(%r14),%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%r14) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -16(%rcx),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r9,1),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%r14) + movq %rdx,%r13 + +.byte 102,72,15,126,195 + leaq (%rcx,%r9,2),%rcx + + xorq %rdi,%rdi + addq %r10,%r13 + adcq $0,%rdi + movq %r13,-8(%r14) + + jmp L$outer4x + +.p2align 5 +L$outer4x: + movq (%r14,%r9,1),%r10 + movq %r8,%rbp + mulq %rbx + addq %rax,%r10 + movq (%rcx),%rax + adcq $0,%rdx + + movq -96(%r12),%xmm0 + movq -32(%r12),%xmm1 + pand %xmm4,%xmm0 + movq 32(%r12),%xmm2 + pand %xmm5,%xmm1 + movq 96(%r12),%xmm3 + + imulq %r10,%rbp +.byte 0x67 + movq %rdx,%r11 + movq %rdi,(%r14) + + pand %xmm6,%xmm2 + por %xmm1,%xmm0 + pand %xmm7,%xmm3 + por %xmm2,%xmm0 + leaq (%r14,%r9,1),%r14 + leaq 256(%r12),%r12 + por %xmm3,%xmm0 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi,%r9,1),%rax + adcq $0,%rdx + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 16(%rcx),%rax + adcq $0,%rdx + addq 8(%r14),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi,%r9,1),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 32(%r9),%r15 + leaq 64(%rcx),%rcx + adcq $0,%rdx + movq %rdx,%r13 + jmp L$inner4x + +.p2align 5 +L$inner4x: + mulq %rbx + addq %rax,%r10 + movq -32(%rcx),%rax + adcq $0,%rdx + addq 16(%r14),%r10 + leaq 32(%r14),%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdi,-32(%r14) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -16(%rcx),%rax + adcq $0,%rdx + addq -8(%r14),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %r13,-24(%r14) + movq %rdx,%r13 + + mulq %rbx + addq %rax,%r10 + movq 0(%rcx),%rax + adcq $0,%rdx + addq (%r14),%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq 8(%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdi,-16(%r14) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 16(%rcx),%rax + adcq $0,%rdx + addq 8(%r14),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 64(%rcx),%rcx + adcq $0,%rdx + movq %r13,-8(%r14) + movq %rdx,%r13 + + addq $32,%r15 + jnz L$inner4x + + mulq %rbx + addq %rax,%r10 + movq 
-32(%rcx),%rax + adcq $0,%rdx + addq 16(%r14),%r10 + leaq 32(%r14),%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdi,-32(%r14) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq %rbp,%rax + movq -16(%rcx),%rbp + adcq $0,%rdx + addq -8(%r14),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r9,1),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %r13,-24(%r14) + movq %rdx,%r13 + +.byte 102,72,15,126,195 + movq %rdi,-16(%r14) + leaq (%rcx,%r9,2),%rcx + + xorq %rdi,%rdi + addq %r10,%r13 + adcq $0,%rdi + addq (%r14),%r13 + adcq $0,%rdi + movq %r13,-8(%r14) + + cmpq 16+8(%rsp),%r12 + jb L$outer4x + subq %r13,%rbp + adcq %r15,%r15 + orq %r15,%rdi + xorq $1,%rdi + leaq (%r14,%r9,1),%rbx + leaq (%rcx,%rdi,8),%rbp + movq %r9,%rcx + sarq $3+2,%rcx + movq 56+8(%rsp),%rdi + jmp L$sqr4x_sub + +.globl _bn_power5 +.private_extern _bn_power5 + +.p2align 5 +_bn_power5: + movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movl %r9d,%r10d + shll $3,%r9d + shll $3+2,%r10d + negq %r9 + movq (%r8),%r8 + + + + + + + + leaq -64(%rsp,%r9,2),%r11 + subq %rsi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb L$pwr_sp_alt + subq %r11,%rsp + leaq -64(%rsp,%r9,2),%rsp + jmp L$pwr_sp_done + +.p2align 5 +L$pwr_sp_alt: + leaq 4096-64(,%r9,2),%r10 + leaq -64(%rsp,%r9,2),%rsp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rsp +L$pwr_sp_done: + andq $-64,%rsp + movq %r9,%r10 + negq %r9 + + + + + + + + + + + movq %r8,32(%rsp) + movq %rax,40(%rsp) +L$power5_body: +.byte 102,72,15,110,207 +.byte 102,72,15,110,209 +.byte 102,73,15,110,218 +.byte 102,72,15,110,226 + + call __bn_sqr8x_internal + call __bn_sqr8x_internal + call __bn_sqr8x_internal + call __bn_sqr8x_internal + call __bn_sqr8x_internal + +.byte 102,72,15,126,209 +.byte 102,72,15,126,226 + movq %rsi,%rdi + movq 40(%rsp),%rax + leaq 32(%rsp),%r8 + + call mul4x_internal + + movq 40(%rsp),%rsi + movq $1,%rax + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp +L$power5_epilogue: + .byte 0xf3,0xc3 + + +.globl _bn_sqr8x_internal +.private_extern _bn_sqr8x_internal +.private_extern _bn_sqr8x_internal + +.p2align 5 +_bn_sqr8x_internal: +__bn_sqr8x_internal: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + leaq 32(%r10),%rbp + leaq (%rsi,%r9,1),%rsi + + movq %r9,%rcx + + + movq -32(%rsi,%rbp,1),%r14 + leaq 48+8(%rsp,%r9,2),%rdi + movq -24(%rsi,%rbp,1),%rax + leaq -32(%rdi,%rbp,1),%rdi + movq -16(%rsi,%rbp,1),%rbx + movq %rax,%r15 + + mulq %r14 + movq %rax,%r10 + movq %rbx,%rax + movq %rdx,%r11 + movq %r10,-24(%rdi,%rbp,1) + + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + adcq $0,%rdx + movq %r11,-16(%rdi,%rbp,1) + movq %rdx,%r10 + + + movq -8(%rsi,%rbp,1),%rbx + mulq %r15 + movq %rax,%r12 + movq %rbx,%rax + movq %rdx,%r13 + + leaq (%rbp),%rcx + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + movq %rdx,%r11 + adcq $0,%r11 + addq %r12,%r10 + adcq $0,%r11 + movq %r10,-8(%rdi,%rcx,1) + jmp L$sqr4x_1st + +.p2align 5 +L$sqr4x_1st: + movq (%rsi,%rcx,1),%rbx + mulq %r15 + addq %rax,%r13 + movq %rbx,%rax + movq %rdx,%r12 + adcq $0,%r12 + + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + movq 8(%rsi,%rcx,1),%rbx + movq %rdx,%r10 + adcq $0,%r10 + addq %r13,%r11 + adcq $0,%r10 + + + mulq %r15 + addq 
%rax,%r12 + movq %rbx,%rax + movq %r11,(%rdi,%rcx,1) + movq %rdx,%r13 + adcq $0,%r13 + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + movq 16(%rsi,%rcx,1),%rbx + movq %rdx,%r11 + adcq $0,%r11 + addq %r12,%r10 + adcq $0,%r11 + + mulq %r15 + addq %rax,%r13 + movq %rbx,%rax + movq %r10,8(%rdi,%rcx,1) + movq %rdx,%r12 + adcq $0,%r12 + + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + movq 24(%rsi,%rcx,1),%rbx + movq %rdx,%r10 + adcq $0,%r10 + addq %r13,%r11 + adcq $0,%r10 + + + mulq %r15 + addq %rax,%r12 + movq %rbx,%rax + movq %r11,16(%rdi,%rcx,1) + movq %rdx,%r13 + adcq $0,%r13 + leaq 32(%rcx),%rcx + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + movq %rdx,%r11 + adcq $0,%r11 + addq %r12,%r10 + adcq $0,%r11 + movq %r10,-8(%rdi,%rcx,1) + + cmpq $0,%rcx + jne L$sqr4x_1st + + mulq %r15 + addq %rax,%r13 + leaq 16(%rbp),%rbp + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + + movq %r13,(%rdi) + movq %rdx,%r12 + movq %rdx,8(%rdi) + jmp L$sqr4x_outer + +.p2align 5 +L$sqr4x_outer: + movq -32(%rsi,%rbp,1),%r14 + leaq 48+8(%rsp,%r9,2),%rdi + movq -24(%rsi,%rbp,1),%rax + leaq -32(%rdi,%rbp,1),%rdi + movq -16(%rsi,%rbp,1),%rbx + movq %rax,%r15 + + mulq %r14 + movq -24(%rdi,%rbp,1),%r10 + addq %rax,%r10 + movq %rbx,%rax + adcq $0,%rdx + movq %r10,-24(%rdi,%rbp,1) + movq %rdx,%r11 + + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + adcq $0,%rdx + addq -16(%rdi,%rbp,1),%r11 + movq %rdx,%r10 + adcq $0,%r10 + movq %r11,-16(%rdi,%rbp,1) + + xorq %r12,%r12 + + movq -8(%rsi,%rbp,1),%rbx + mulq %r15 + addq %rax,%r12 + movq %rbx,%rax + adcq $0,%rdx + addq -8(%rdi,%rbp,1),%r12 + movq %rdx,%r13 + adcq $0,%r13 + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + adcq $0,%rdx + addq %r12,%r10 + movq %rdx,%r11 + adcq $0,%r11 + movq %r10,-8(%rdi,%rbp,1) + + leaq (%rbp),%rcx + jmp L$sqr4x_inner + +.p2align 5 +L$sqr4x_inner: + movq (%rsi,%rcx,1),%rbx + mulq %r15 + addq %rax,%r13 + movq %rbx,%rax + movq %rdx,%r12 + adcq $0,%r12 + addq (%rdi,%rcx,1),%r13 + adcq $0,%r12 + +.byte 0x67 + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + movq 8(%rsi,%rcx,1),%rbx + movq %rdx,%r10 + adcq $0,%r10 + addq %r13,%r11 + adcq $0,%r10 + + mulq %r15 + addq %rax,%r12 + movq %r11,(%rdi,%rcx,1) + movq %rbx,%rax + movq %rdx,%r13 + adcq $0,%r13 + addq 8(%rdi,%rcx,1),%r12 + leaq 16(%rcx),%rcx + adcq $0,%r13 + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + adcq $0,%rdx + addq %r12,%r10 + movq %rdx,%r11 + adcq $0,%r11 + movq %r10,-8(%rdi,%rcx,1) + + cmpq $0,%rcx + jne L$sqr4x_inner + +.byte 0x67 + mulq %r15 + addq %rax,%r13 + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + + movq %r13,(%rdi) + movq %rdx,%r12 + movq %rdx,8(%rdi) + + addq $16,%rbp + jnz L$sqr4x_outer + + + movq -32(%rsi),%r14 + leaq 48+8(%rsp,%r9,2),%rdi + movq -24(%rsi),%rax + leaq -32(%rdi,%rbp,1),%rdi + movq -16(%rsi),%rbx + movq %rax,%r15 + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + movq %rdx,%r11 + adcq $0,%r11 + + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + movq %r10,-24(%rdi) + movq %rdx,%r10 + adcq $0,%r10 + addq %r13,%r11 + movq -8(%rsi),%rbx + adcq $0,%r10 + + mulq %r15 + addq %rax,%r12 + movq %rbx,%rax + movq %r11,-16(%rdi) + movq %rdx,%r13 + adcq $0,%r13 + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + movq %rdx,%r11 + adcq $0,%r11 + addq %r12,%r10 + adcq $0,%r11 + movq %r10,-8(%rdi) + + mulq %r15 + addq %rax,%r13 + movq -16(%rsi),%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + + movq %r13,(%rdi) + movq %rdx,%r12 + movq %rdx,8(%rdi) + + mulq %rbx + addq $16,%rbp + xorq %r14,%r14 + subq %r9,%rbp + xorq %r15,%r15 + + addq %r12,%rax + adcq $0,%rdx + movq %rax,8(%rdi) 
+ movq %rdx,16(%rdi) + movq %r15,24(%rdi) + + movq -16(%rsi,%rbp,1),%rax + leaq 48+8(%rsp),%rdi + xorq %r10,%r10 + movq 8(%rdi),%r11 + + leaq (%r14,%r10,2),%r12 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r13 + shrq $63,%r11 + orq %r10,%r13 + movq 16(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq 24(%rdi),%r11 + adcq %rax,%r12 + movq -8(%rsi,%rbp,1),%rax + movq %r12,(%rdi) + adcq %rdx,%r13 + + leaq (%r14,%r10,2),%rbx + movq %r13,8(%rdi) + sbbq %r15,%r15 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r8 + shrq $63,%r11 + orq %r10,%r8 + movq 32(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq 40(%rdi),%r11 + adcq %rax,%rbx + movq 0(%rsi,%rbp,1),%rax + movq %rbx,16(%rdi) + adcq %rdx,%r8 + leaq 16(%rbp),%rbp + movq %r8,24(%rdi) + sbbq %r15,%r15 + leaq 64(%rdi),%rdi + jmp L$sqr4x_shift_n_add + +.p2align 5 +L$sqr4x_shift_n_add: + leaq (%r14,%r10,2),%r12 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r13 + shrq $63,%r11 + orq %r10,%r13 + movq -16(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq -8(%rdi),%r11 + adcq %rax,%r12 + movq -8(%rsi,%rbp,1),%rax + movq %r12,-32(%rdi) + adcq %rdx,%r13 + + leaq (%r14,%r10,2),%rbx + movq %r13,-24(%rdi) + sbbq %r15,%r15 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r8 + shrq $63,%r11 + orq %r10,%r8 + movq 0(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq 8(%rdi),%r11 + adcq %rax,%rbx + movq 0(%rsi,%rbp,1),%rax + movq %rbx,-16(%rdi) + adcq %rdx,%r8 + + leaq (%r14,%r10,2),%r12 + movq %r8,-8(%rdi) + sbbq %r15,%r15 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r13 + shrq $63,%r11 + orq %r10,%r13 + movq 16(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq 24(%rdi),%r11 + adcq %rax,%r12 + movq 8(%rsi,%rbp,1),%rax + movq %r12,0(%rdi) + adcq %rdx,%r13 + + leaq (%r14,%r10,2),%rbx + movq %r13,8(%rdi) + sbbq %r15,%r15 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r8 + shrq $63,%r11 + orq %r10,%r8 + movq 32(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq 40(%rdi),%r11 + adcq %rax,%rbx + movq 16(%rsi,%rbp,1),%rax + movq %rbx,16(%rdi) + adcq %rdx,%r8 + movq %r8,24(%rdi) + sbbq %r15,%r15 + leaq 64(%rdi),%rdi + addq $32,%rbp + jnz L$sqr4x_shift_n_add + + leaq (%r14,%r10,2),%r12 +.byte 0x67 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r13 + shrq $63,%r11 + orq %r10,%r13 + movq -16(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq -8(%rdi),%r11 + adcq %rax,%r12 + movq -8(%rsi),%rax + movq %r12,-32(%rdi) + adcq %rdx,%r13 + + leaq (%r14,%r10,2),%rbx + movq %r13,-24(%rdi) + sbbq %r15,%r15 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r8 + shrq $63,%r11 + orq %r10,%r8 + mulq %rax + negq %r15 + adcq %rax,%rbx + adcq %rdx,%r8 + movq %rbx,-16(%rdi) + movq %r8,-8(%rdi) +.byte 102,72,15,126,213 +sqr8x_reduction: + xorq %rax,%rax + leaq (%rbp,%r9,2),%rcx + leaq 48+8(%rsp,%r9,2),%rdx + movq %rcx,0+8(%rsp) + leaq 48+8(%rsp,%r9,1),%rdi + movq %rdx,8+8(%rsp) + negq %r9 + jmp L$8x_reduction_loop + +.p2align 5 +L$8x_reduction_loop: + leaq (%rdi,%r9,1),%rdi +.byte 0x66 + movq 0(%rdi),%rbx + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq %rax,(%rdx) + leaq 64(%rdi),%rdi + +.byte 0x67 + movq %rbx,%r8 + imulq 32+8(%rsp),%rbx + movq 0(%rbp),%rax + movl $8,%ecx + jmp L$8x_reduce + +.p2align 5 +L$8x_reduce: + mulq %rbx + movq 16(%rbp),%rax + negq %r8 + movq %rdx,%r8 + adcq $0,%r8 + + mulq %rbx + addq %rax,%r9 + movq 32(%rbp),%rax + adcq $0,%rdx + addq %r9,%r8 + movq %rbx,48-8+8(%rsp,%rcx,8) + movq %rdx,%r9 + adcq $0,%r9 + + mulq %rbx + addq %rax,%r10 + movq 48(%rbp),%rax + adcq $0,%rdx + 
addq %r10,%r9 + movq 32+8(%rsp),%rsi + movq %rdx,%r10 + adcq $0,%r10 + + mulq %rbx + addq %rax,%r11 + movq 64(%rbp),%rax + adcq $0,%rdx + imulq %r8,%rsi + addq %r11,%r10 + movq %rdx,%r11 + adcq $0,%r11 + + mulq %rbx + addq %rax,%r12 + movq 80(%rbp),%rax + adcq $0,%rdx + addq %r12,%r11 + movq %rdx,%r12 + adcq $0,%r12 + + mulq %rbx + addq %rax,%r13 + movq 96(%rbp),%rax + adcq $0,%rdx + addq %r13,%r12 + movq %rdx,%r13 + adcq $0,%r13 + + mulq %rbx + addq %rax,%r14 + movq 112(%rbp),%rax + adcq $0,%rdx + addq %r14,%r13 + movq %rdx,%r14 + adcq $0,%r14 + + mulq %rbx + movq %rsi,%rbx + addq %rax,%r15 + movq 0(%rbp),%rax + adcq $0,%rdx + addq %r15,%r14 + movq %rdx,%r15 + adcq $0,%r15 + + decl %ecx + jnz L$8x_reduce + + leaq 128(%rbp),%rbp + xorq %rax,%rax + movq 8+8(%rsp),%rdx + cmpq 0+8(%rsp),%rbp + jae L$8x_no_tail + +.byte 0x66 + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + sbbq %rsi,%rsi + + movq 48+56+8(%rsp),%rbx + movl $8,%ecx + movq 0(%rbp),%rax + jmp L$8x_tail + +.p2align 5 +L$8x_tail: + mulq %rbx + addq %rax,%r8 + movq 16(%rbp),%rax + movq %r8,(%rdi) + movq %rdx,%r8 + adcq $0,%r8 + + mulq %rbx + addq %rax,%r9 + movq 32(%rbp),%rax + adcq $0,%rdx + addq %r9,%r8 + leaq 8(%rdi),%rdi + movq %rdx,%r9 + adcq $0,%r9 + + mulq %rbx + addq %rax,%r10 + movq 48(%rbp),%rax + adcq $0,%rdx + addq %r10,%r9 + movq %rdx,%r10 + adcq $0,%r10 + + mulq %rbx + addq %rax,%r11 + movq 64(%rbp),%rax + adcq $0,%rdx + addq %r11,%r10 + movq %rdx,%r11 + adcq $0,%r11 + + mulq %rbx + addq %rax,%r12 + movq 80(%rbp),%rax + adcq $0,%rdx + addq %r12,%r11 + movq %rdx,%r12 + adcq $0,%r12 + + mulq %rbx + addq %rax,%r13 + movq 96(%rbp),%rax + adcq $0,%rdx + addq %r13,%r12 + movq %rdx,%r13 + adcq $0,%r13 + + mulq %rbx + addq %rax,%r14 + movq 112(%rbp),%rax + adcq $0,%rdx + addq %r14,%r13 + movq %rdx,%r14 + adcq $0,%r14 + + mulq %rbx + movq 48-16+8(%rsp,%rcx,8),%rbx + addq %rax,%r15 + adcq $0,%rdx + addq %r15,%r14 + movq 0(%rbp),%rax + movq %rdx,%r15 + adcq $0,%r15 + + decl %ecx + jnz L$8x_tail + + leaq 128(%rbp),%rbp + movq 8+8(%rsp),%rdx + cmpq 0+8(%rsp),%rbp + jae L$8x_tail_done + + movq 48+56+8(%rsp),%rbx + negq %rsi + movq 0(%rbp),%rax + adcq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + sbbq %rsi,%rsi + + movl $8,%ecx + jmp L$8x_tail + +.p2align 5 +L$8x_tail_done: + addq (%rdx),%r8 + xorq %rax,%rax + + negq %rsi +L$8x_no_tail: + adcq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + adcq $0,%rax + movq -16(%rbp),%rcx + xorq %rsi,%rsi + +.byte 102,72,15,126,213 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) +.byte 102,73,15,126,217 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + leaq 64(%rdi),%rdi + + cmpq %rdx,%rdi + jb L$8x_reduction_loop + + subq %r15,%rcx + leaq (%rdi,%r9,1),%rbx + adcq %rsi,%rsi + movq %r9,%rcx + orq %rsi,%rax +.byte 102,72,15,126,207 + xorq $1,%rax +.byte 102,72,15,126,206 + leaq (%rbp,%rax,8),%rbp + sarq $3+2,%rcx + jmp L$sqr4x_sub + +.p2align 5 +L$sqr4x_sub: +.byte 0x66 + movq 0(%rbx),%r12 + movq 8(%rbx),%r13 + sbbq 0(%rbp),%r12 + movq 16(%rbx),%r14 + sbbq 16(%rbp),%r13 + movq 24(%rbx),%r15 + leaq 32(%rbx),%rbx + sbbq 32(%rbp),%r14 + movq %r12,0(%rdi) + sbbq 48(%rbp),%r15 + leaq 64(%rbp),%rbp + 
movq %r13,8(%rdi) + movq %r14,16(%rdi) + movq %r15,24(%rdi) + leaq 32(%rdi),%rdi + + incq %rcx + jnz L$sqr4x_sub + movq %r9,%r10 + negq %r9 + .byte 0xf3,0xc3 + +.globl _bn_from_montgomery +.private_extern _bn_from_montgomery + +.p2align 5 +_bn_from_montgomery: + testl $7,%r9d + jz bn_from_mont8x + xorl %eax,%eax + .byte 0xf3,0xc3 + + + +.p2align 5 +bn_from_mont8x: +.byte 0x67 + movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 +.byte 0x67 + movl %r9d,%r10d + shll $3,%r9d + shll $3+2,%r10d + negq %r9 + movq (%r8),%r8 + + + + + + + + leaq -64(%rsp,%r9,2),%r11 + subq %rsi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb L$from_sp_alt + subq %r11,%rsp + leaq -64(%rsp,%r9,2),%rsp + jmp L$from_sp_done + +.p2align 5 +L$from_sp_alt: + leaq 4096-64(,%r9,2),%r10 + leaq -64(%rsp,%r9,2),%rsp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rsp +L$from_sp_done: + andq $-64,%rsp + movq %r9,%r10 + negq %r9 + + + + + + + + + + + movq %r8,32(%rsp) + movq %rax,40(%rsp) +L$from_body: + movq %r9,%r11 + leaq 48(%rsp),%rax + pxor %xmm0,%xmm0 + jmp L$mul_by_1 + +.p2align 5 +L$mul_by_1: + movdqu (%rsi),%xmm1 + movdqu 16(%rsi),%xmm2 + movdqu 32(%rsi),%xmm3 + movdqa %xmm0,(%rax,%r9,1) + movdqu 48(%rsi),%xmm4 + movdqa %xmm0,16(%rax,%r9,1) +.byte 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00 + movdqa %xmm1,(%rax) + movdqa %xmm0,32(%rax,%r9,1) + movdqa %xmm2,16(%rax) + movdqa %xmm0,48(%rax,%r9,1) + movdqa %xmm3,32(%rax) + movdqa %xmm4,48(%rax) + leaq 64(%rax),%rax + subq $64,%r11 + jnz L$mul_by_1 + +.byte 102,72,15,110,207 +.byte 102,72,15,110,209 +.byte 0x67 + movq %rcx,%rbp +.byte 102,73,15,110,218 + call sqr8x_reduction + + pxor %xmm0,%xmm0 + leaq 48(%rsp),%rax + movq 40(%rsp),%rsi + jmp L$from_mont_zero + +.p2align 5 +L$from_mont_zero: + movdqa %xmm0,0(%rax) + movdqa %xmm0,16(%rax) + movdqa %xmm0,32(%rax) + movdqa %xmm0,48(%rax) + leaq 64(%rax),%rax + subq $32,%r9 + jnz L$from_mont_zero + + movq $1,%rax + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp +L$from_epilogue: + .byte 0xf3,0xc3 + +.globl _bn_scatter5 +.private_extern _bn_scatter5 + +.p2align 4 +_bn_scatter5: + cmpl $0,%esi + jz L$scatter_epilogue + leaq (%rdx,%rcx,8),%rdx +L$scatter: + movq (%rdi),%rax + leaq 8(%rdi),%rdi + movq %rax,(%rdx) + leaq 256(%rdx),%rdx + subl $1,%esi + jnz L$scatter +L$scatter_epilogue: + .byte 0xf3,0xc3 + + +.globl _bn_gather5 +.private_extern _bn_gather5 + +.p2align 4 +_bn_gather5: + movl %ecx,%r11d + shrl $3,%ecx + andq $7,%r11 + notl %ecx + leaq L$magic_masks(%rip),%rax + andl $3,%ecx + leaq 128(%rdx,%r11,8),%rdx + movq 0(%rax,%rcx,8),%xmm4 + movq 8(%rax,%rcx,8),%xmm5 + movq 16(%rax,%rcx,8),%xmm6 + movq 24(%rax,%rcx,8),%xmm7 + jmp L$gather +.p2align 4 +L$gather: + movq -128(%rdx),%xmm0 + movq -64(%rdx),%xmm1 + pand %xmm4,%xmm0 + movq 0(%rdx),%xmm2 + pand %xmm5,%xmm1 + movq 64(%rdx),%xmm3 + pand %xmm6,%xmm2 + por %xmm1,%xmm0 + pand %xmm7,%xmm3 +.byte 0x67,0x67 + por %xmm2,%xmm0 + leaq 256(%rdx),%rdx + por %xmm3,%xmm0 + + movq %xmm0,(%rdi) + leaq 8(%rdi),%rdi + subl $1,%esi + jnz L$gather + .byte 0xf3,0xc3 +L$SEH_end_bn_gather5: + +.p2align 6 +L$magic_masks: +.long 0,0, 0,0, 0,0, -1,-1 +.long 0,0, 0,0, 0,0, 0,0 +.byte 
77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +#endif
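
The assembly added above relies on the same mask-based selection idiom in several places: bn_gather5 combines table words under the L$magic_masks constants with pand/por rather than doing a plain load at a secret-derived offset, and the L$copy / L$copy4x loops pick between the reduced and unreduced Montgomery result with xor/and/xor (or pxor/pand/pxor) masking driven by the borrow of the final subtraction. As a rough, illustrative sketch of that idea only (it is not code from this commit, and the helper names ct_eq_mask and ct_gather_u64 are invented for the example), the same selection written in portable C looks roughly like this:

#include <stdint.h>
#include <stddef.h>

/* Illustrative sketch only; not code from this commit.  It shows the
 * mask-based constant-time selection that the assembly above performs
 * with pand/por (bn_gather5, L$magic_masks) and with xor/and/xor
 * (the L$copy / L$copy4x result-selection loops). */

/* All-ones if a == b, all-zeros otherwise, with no data-dependent branch. */
static uint64_t ct_eq_mask(uint64_t a, uint64_t b)
{
    uint64_t x = a ^ b;
    x = (x | (0 - x)) >> 63;   /* 1 when a != b, 0 when a == b */
    return x - 1;              /* 0 when a != b, ~0 when a == b */
}

/* Read every entry of the table and keep only table[secret_idx], so the
 * sequence of memory accesses does not depend on the secret index. */
uint64_t ct_gather_u64(const uint64_t *table, size_t n, size_t secret_idx)
{
    uint64_t acc = 0;
    for (size_t i = 0; i < n; i++)
        acc |= table[i] & ct_eq_mask((uint64_t)i, (uint64_t)secret_idx);
    return acc;
}

The xorq/andq/xorq triple in L$copy is the two-input form of the same trick: r = b ^ ((a ^ b) & mask) yields a when the mask is all-ones and b when it is zero, which is how the code chooses between the subtracted and unsubtracted result without a branch.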