Diffstat (limited to 'mac-x86_64')
-rw-r--r--  mac-x86_64/crypto/bn/modexp512-x86_64.S  1776
-rw-r--r--  mac-x86_64/crypto/cpu-x86_64-asm.S        143
2 files changed, 1919 insertions, 0 deletions
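The first file is the Darwin perlasm output for mod_exp_512, a 512-bit modular exponentiation: MULADD_128x512 is the shared multiply-accumulate kernel, mont_reduce the Montgomery reduction, mont_mul_a3b and sqr_reduce the product and square routines that tail-jump into the reduction, and _mod_exp_512 the driver, which walks the exponent in fixed 5-bit windows over a 32-entry table of powers. Below is a toy C model of that window walk, with uint64_t arithmetic standing in for the 512-bit Montgomery arithmetic (modexp_w5 and the demo values are illustrative only; assumes a compiler with unsigned __int128):

    #include <stdint.h>
    #include <stdio.h>

    /* one modular multiply; the .S file does this on 512-bit Montgomery
     * residues via mont_mul_a3b/sqr_reduce */
    static uint64_t mulmod(uint64_t a, uint64_t b, uint64_t m) {
      return (uint64_t)(((unsigned __int128)a * b) % m);
    }

    static uint64_t modexp_w5(uint64_t g, uint64_t e, uint64_t m) {
      uint64_t tbl[32];                  /* tbl[i] = g^i mod m */
      tbl[0] = 1 % m;
      for (int i = 1; i < 32; i++) tbl[i] = mulmod(tbl[i - 1], g, m);

      uint64_t r = 1 % m;
      for (int bit = 60; bit >= 0; bit -= 5) {             /* top window first */
        for (int s = 0; s < 5; s++) r = mulmod(r, r, m);   /* 5 squarings */
        r = mulmod(r, tbl[(e >> bit) & 31], m);            /* 1 table multiply */
      }
      return r;
    }

    int main(void) {
      uint64_t m = 1000000007, g = 3, e = 200, want = 1;
      for (uint64_t i = 0; i < e; i++) want = mulmod(want, g, m); /* naive check */
      printf("%llu %llu\n", (unsigned long long)modexp_w5(g, e, m),
             (unsigned long long)want);  /* the two values should match */
      return 0;
    }

The real code additionally stores the power table as interleaved 16-bit slices so that the per-window table lookup has a data-independent memory access pattern (see loop_0 through loop_3 below).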
diff --git a/mac-x86_64/crypto/bn/modexp512-x86_64.S b/mac-x86_64/crypto/bn/modexp512-x86_64.S
new file mode 100644
index 0000000..beb133e
--- /dev/null
+++ b/mac-x86_64/crypto/bn/modexp512-x86_64.S
@@ -0,0 +1,1776 @@
+#if defined(__x86_64__)
+.text
+
+
+.p2align 4
+MULADD_128x512:
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ movq %r8,0(%rcx)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%r8
+ movq 8(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ movq %r9,8(%rcx)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%r9
+ .byte 0xf3,0xc3
+
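+# mont_reduce: folds the upper half of the 1024-bit intermediate at
+# 520(%rsp) back into the lower half (Montgomery-style reduction for the
+# 512-bit modulus), reusing MULADD_128x512 for the bulk multiplies and
+# finishing with a masked subtract so the result fits in eight limbs.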
+.p2align 4
+mont_reduce:
+ leaq 192(%rsp),%rdi
+ movq 32(%rsp),%rsi
+ addq $576,%rsi
+ leaq 520(%rsp),%rcx
+
+ movq 96(%rcx),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ movq (%rcx),%r8
+ addq %rax,%r8
+ adcq $0,%rdx
+ movq %r8,0(%rdi)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ movq 8(%rcx),%r9
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ movq 16(%rcx),%r10
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ movq 24(%rcx),%r11
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ movq 32(%rcx),%r12
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ movq 40(%rcx),%r13
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ movq 48(%rcx),%r14
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ movq 56(%rcx),%r15
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%r8
+ movq 104(%rcx),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ movq %r9,8(%rdi)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%r9
+ movq 112(%rcx),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ movq %r10,16(%rdi)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 120(%rcx),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ movq %r11,24(%rdi)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+ xorq %rax,%rax
+
+ addq 64(%rcx),%r8
+ adcq 72(%rcx),%r9
+ adcq 80(%rcx),%r10
+ adcq 88(%rcx),%r11
+ adcq $0,%rax
+
+
+
+
+ movq %r8,64(%rdi)
+ movq %r9,72(%rdi)
+ movq %r10,%rbp
+ movq %r11,88(%rdi)
+
+ movq %rax,384(%rsp)
+
+ movq 0(%rdi),%r8
+ movq 8(%rdi),%r9
+ movq 16(%rdi),%r10
+ movq 24(%rdi),%r11
+
+
+
+
+
+
+
+
+ addq $80,%rdi
+
+ addq $64,%rsi
+ leaq 296(%rsp),%rcx
+
+ call MULADD_128x512
+
+ movq 384(%rsp),%rax
+
+
+ addq -16(%rdi),%r8
+ adcq -8(%rdi),%r9
+ movq %r8,64(%rcx)
+ movq %r9,72(%rcx)
+
+ adcq %rax,%rax
+ movq %rax,384(%rsp)
+
+ leaq 192(%rsp),%rdi
+ addq $64,%rsi
+
+
+
+
+
+ movq (%rsi),%r8
+ movq 8(%rsi),%rbx
+
+ movq (%rcx),%rax
+ mulq %r8
+ movq %rax,%rbp
+ movq %rdx,%r9
+
+ movq 8(%rcx),%rax
+ mulq %r8
+ addq %rax,%r9
+
+ movq (%rcx),%rax
+ mulq %rbx
+ addq %rax,%r9
+
+ movq %r9,8(%rdi)
+
+
+ subq $192,%rsi
+
+ movq (%rcx),%r8
+ movq 8(%rcx),%r9
+
+ call MULADD_128x512
+
+
+
+ movq 0(%rsi),%rax
+ movq 8(%rsi),%rbx
+ movq 16(%rsi),%rdi
+ movq 24(%rsi),%rdx
+
+
+ movq 384(%rsp),%rbp
+
+ addq 64(%rcx),%r8
+ adcq 72(%rcx),%r9
+
+
+ adcq %rbp,%rbp
+
+
+
+ shlq $3,%rbp
+ movq 32(%rsp),%rcx
+ addq %rcx,%rbp
+
+
+ xorq %rsi,%rsi
+
+ addq 0(%rbp),%r10
+ adcq 64(%rbp),%r11
+ adcq 128(%rbp),%r12
+ adcq 192(%rbp),%r13
+ adcq 256(%rbp),%r14
+ adcq 320(%rbp),%r15
+ adcq 384(%rbp),%r8
+ adcq 448(%rbp),%r9
+
+
+
+ sbbq $0,%rsi
+
+
+ andq %rsi,%rax
+ andq %rsi,%rbx
+ andq %rsi,%rdi
+ andq %rsi,%rdx
+
+ movq $1,%rbp
+ subq %rax,%r10
+ sbbq %rbx,%r11
+ sbbq %rdi,%r12
+ sbbq %rdx,%r13
+
+
+
+
+ sbbq $0,%rbp
+
+
+
+ addq $512,%rcx
+ movq 32(%rcx),%rax
+ movq 40(%rcx),%rbx
+ movq 48(%rcx),%rdi
+ movq 56(%rcx),%rdx
+
+
+
+ andq %rsi,%rax
+ andq %rsi,%rbx
+ andq %rsi,%rdi
+ andq %rsi,%rdx
+
+
+
+ subq $1,%rbp
+
+ sbbq %rax,%r14
+ sbbq %rbx,%r15
+ sbbq %rdi,%r8
+ sbbq %rdx,%r9
+
+
+
+ movq 144(%rsp),%rsi
+ movq %r10,0(%rsi)
+ movq %r11,8(%rsi)
+ movq %r12,16(%rsi)
+ movq %r13,24(%rsi)
+ movq %r14,32(%rsi)
+ movq %r15,40(%rsi)
+ movq %r8,48(%rsi)
+ movq %r9,56(%rsi)
+
+ .byte 0xf3,0xc3
+
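+# mont_mul_a3b: 512x512->1024-bit schoolbook multiply.  The first operand
+# is already live in r10,r11,...,r8,r9 (and addressable via %rsi), the
+# second at (%rdi); partial products accumulate into 520(%rsp) one row per
+# limb of (%rdi), and the routine ends by jumping into mont_reduce.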
+.p2align 4
+mont_mul_a3b:
+
+
+
+
+ movq 0(%rdi),%rbp
+
+ movq %r10,%rax
+ mulq %rbp
+ movq %rax,520(%rsp)
+ movq %rdx,%r10
+ movq %r11,%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+ movq %r12,%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ movq %rdx,%r12
+ movq %r13,%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ movq %rdx,%r13
+ movq %r14,%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ movq %rdx,%r14
+ movq %r15,%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r15
+ movq %r8,%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ movq %rdx,%r8
+ movq %r9,%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ movq %rdx,%r9
+ movq 8(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ movq %r10,528(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 16(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ movq %r11,536(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
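+# Rows for b[3]..b[6] follow: each row multiplies all eight limbs of the
+# first operand by the next limb of (%rdi), spills the lowest finished
+# limb of the product to the stack and rotates the accumulator registers.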
+ movq 24(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ movq %r12,544(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%r12
+ movq 32(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ movq %r13,552(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%r13
+ movq 40(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %r14,560(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%r14
+ movq 48(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ movq %r15,568(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%r15
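+# Last row, b[7]; afterwards the high half of the product is stored at
+# 584..640(%rsp) and the multiply finishes in mont_reduce.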
+ movq 56(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ movq %r8,576(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%r8
+ movq %r9,584(%rsp)
+ movq %r10,592(%rsp)
+ movq %r11,600(%rsp)
+ movq %r12,608(%rsp)
+ movq %r13,616(%rsp)
+ movq %r14,624(%rsp)
+ movq %r15,632(%rsp)
+ movq %r8,640(%rsp)
+
+
+
+
+
+ jmp mont_reduce
+
+
+
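+# sqr_reduce: squares the 512-bit value addressed by 16(%rsp) (its limbs
+# are already live in r10..r15,r8,r9): the off-diagonal products a[i]*a[j]
+# are computed once and doubled, the diagonal squares a[i]*a[i] are added
+# in, and the result is reduced by jumping into mont_reduce.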
+.p2align 4
+sqr_reduce:
+ movq 16(%rsp),%rcx
+
+
+
+ movq %r10,%rbx
+
+ movq %r11,%rax
+ mulq %rbx
+ movq %rax,528(%rsp)
+ movq %rdx,%r10
+ movq %r12,%rax
+ mulq %rbx
+ addq %rax,%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+ movq %r13,%rax
+ mulq %rbx
+ addq %rax,%r11
+ adcq $0,%rdx
+ movq %rdx,%r12
+ movq %r14,%rax
+ mulq %rbx
+ addq %rax,%r12
+ adcq $0,%rdx
+ movq %rdx,%r13
+ movq %r15,%rax
+ mulq %rbx
+ addq %rax,%r13
+ adcq $0,%rdx
+ movq %rdx,%r14
+ movq %r8,%rax
+ mulq %rbx
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r15
+ movq %r9,%rax
+ mulq %rbx
+ addq %rax,%r15
+ adcq $0,%rdx
+ movq %rdx,%rsi
+
+ movq %r10,536(%rsp)
+
+
+
+
+
+ movq 8(%rcx),%rbx
+
+ movq 16(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r11
+ adcq $0,%rdx
+ movq %r11,544(%rsp)
+
+ movq %rdx,%r10
+ movq 24(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %r10,%r12
+ adcq $0,%rdx
+ movq %r12,552(%rsp)
+
+ movq %rdx,%r10
+ movq 32(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+
+ movq %rdx,%r10
+ movq 40(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %r10,%r14
+ adcq $0,%rdx
+
+ movq %rdx,%r10
+ movq %r8,%rax
+ mulq %rbx
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %r10,%r15
+ adcq $0,%rdx
+
+ movq %rdx,%r10
+ movq %r9,%rax
+ mulq %rbx
+ addq %rax,%rsi
+ adcq $0,%rdx
+ addq %r10,%rsi
+ adcq $0,%rdx
+
+ movq %rdx,%r11
+
+
+
+
+ movq 16(%rcx),%rbx
+
+ movq 24(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r13
+ adcq $0,%rdx
+ movq %r13,560(%rsp)
+
+ movq %rdx,%r10
+ movq 32(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %r10,%r14
+ adcq $0,%rdx
+ movq %r14,568(%rsp)
+
+ movq %rdx,%r10
+ movq 40(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %r10,%r15
+ adcq $0,%rdx
+
+ movq %rdx,%r10
+ movq %r8,%rax
+ mulq %rbx
+ addq %rax,%rsi
+ adcq $0,%rdx
+ addq %r10,%rsi
+ adcq $0,%rdx
+
+ movq %rdx,%r10
+ movq %r9,%rax
+ mulq %rbx
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %r10,%r11
+ adcq $0,%rdx
+
+ movq %rdx,%r12
+
+
+
+
+
+ movq 24(%rcx),%rbx
+
+ movq 32(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r15
+ adcq $0,%rdx
+ movq %r15,576(%rsp)
+
+ movq %rdx,%r10
+ movq 40(%rcx),%rax
+ mulq %rbx
+ addq %rax,%rsi
+ adcq $0,%rdx
+ addq %r10,%rsi
+ adcq $0,%rdx
+ movq %rsi,584(%rsp)
+
+ movq %rdx,%r10
+ movq %r8,%rax
+ mulq %rbx
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %r10,%r11
+ adcq $0,%rdx
+
+ movq %rdx,%r10
+ movq %r9,%rax
+ mulq %rbx
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %r10,%r12
+ adcq $0,%rdx
+
+ movq %rdx,%r15
+
+
+
+
+ movq 32(%rcx),%rbx
+
+ movq 40(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r11
+ adcq $0,%rdx
+ movq %r11,592(%rsp)
+
+ movq %rdx,%r10
+ movq %r8,%rax
+ mulq %rbx
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %r10,%r12
+ adcq $0,%rdx
+ movq %r12,600(%rsp)
+
+ movq %rdx,%r10
+ movq %r9,%rax
+ mulq %rbx
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %r10,%r15
+ adcq $0,%rdx
+
+ movq %rdx,%r11
+
+
+
+
+ movq 40(%rcx),%rbx
+
+ movq %r8,%rax
+ mulq %rbx
+ addq %rax,%r15
+ adcq $0,%rdx
+ movq %r15,608(%rsp)
+
+ movq %rdx,%r10
+ movq %r9,%rax
+ mulq %rbx
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %r10,%r11
+ adcq $0,%rdx
+ movq %r11,616(%rsp)
+
+ movq %rdx,%r12
+
+
+
+
+ movq %r8,%rbx
+
+ movq %r9,%rax
+ mulq %rbx
+ addq %rax,%r12
+ adcq $0,%rdx
+ movq %r12,624(%rsp)
+
+ movq %rdx,632(%rsp)
+
+
+ movq 528(%rsp),%r10
+ movq 536(%rsp),%r11
+ movq 544(%rsp),%r12
+ movq 552(%rsp),%r13
+ movq 560(%rsp),%r14
+ movq 568(%rsp),%r15
+
+ movq 24(%rcx),%rax
+ mulq %rax
+ movq %rax,%rdi
+ movq %rdx,%r8
+
+ addq %r10,%r10
+ adcq %r11,%r11
+ adcq %r12,%r12
+ adcq %r13,%r13
+ adcq %r14,%r14
+ adcq %r15,%r15
+ adcq $0,%r8
+
+ movq 0(%rcx),%rax
+ mulq %rax
+ movq %rax,520(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rcx),%rax
+ mulq %rax
+
+ addq %rbx,%r10
+ adcq %rax,%r11
+ adcq $0,%rdx
+
+ movq %rdx,%rbx
+ movq %r10,528(%rsp)
+ movq %r11,536(%rsp)
+
+ movq 16(%rcx),%rax
+ mulq %rax
+
+ addq %rbx,%r12
+ adcq %rax,%r13
+ adcq $0,%rdx
+
+ movq %rdx,%rbx
+
+ movq %r12,544(%rsp)
+ movq %r13,552(%rsp)
+
+ xorq %rbp,%rbp
+ addq %rbx,%r14
+ adcq %rdi,%r15
+ adcq $0,%rbp
+
+ movq %r14,560(%rsp)
+ movq %r15,568(%rsp)
+
+
+
+
+ movq 576(%rsp),%r10
+ movq 584(%rsp),%r11
+ movq 592(%rsp),%r12
+ movq 600(%rsp),%r13
+ movq 608(%rsp),%r14
+ movq 616(%rsp),%r15
+ movq 624(%rsp),%rdi
+ movq 632(%rsp),%rsi
+
+ movq %r9,%rax
+ mulq %rax
+ movq %rax,%r9
+ movq %rdx,%rbx
+
+ addq %r10,%r10
+ adcq %r11,%r11
+ adcq %r12,%r12
+ adcq %r13,%r13
+ adcq %r14,%r14
+ adcq %r15,%r15
+ adcq %rdi,%rdi
+ adcq %rsi,%rsi
+ adcq $0,%rbx
+
+ addq %rbp,%r10
+
+ movq 32(%rcx),%rax
+ mulq %rax
+
+ addq %r8,%r10
+ adcq %rax,%r11
+ adcq $0,%rdx
+
+ movq %rdx,%rbp
+
+ movq %r10,576(%rsp)
+ movq %r11,584(%rsp)
+
+ movq 40(%rcx),%rax
+ mulq %rax
+
+ addq %rbp,%r12
+ adcq %rax,%r13
+ adcq $0,%rdx
+
+ movq %rdx,%rbp
+
+ movq %r12,592(%rsp)
+ movq %r13,600(%rsp)
+
+ movq 48(%rcx),%rax
+ mulq %rax
+
+ addq %rbp,%r14
+ adcq %rax,%r15
+ adcq $0,%rdx
+
+ movq %r14,608(%rsp)
+ movq %r15,616(%rsp)
+
+ addq %rdx,%rdi
+ adcq %r9,%rsi
+ adcq $0,%rbx
+
+ movq %rdi,624(%rsp)
+ movq %rsi,632(%rsp)
+ movq %rbx,640(%rsp)
+
+ jmp mont_reduce
+
+
+
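+# mod_exp_512(%rdi=result, %rsi=g, %rdx=exp, %rcx=modulus data; the
+# modulus itself sits 512 bytes into the data block): 512-bit modular
+# exponentiation.  g is brought into Montgomery form, a 32-entry table of
+# its powers is built (init_loop), and each entry is scattered as 16-bit
+# slices (loop_0/loop_1) so the gathers below (loop_2/loop_3) touch the
+# same cache lines for every window value.  The main loop then consumes
+# the exponent in 5-bit windows: five squarings, one table multiply.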
+.globl _mod_exp_512
+.private_extern _mod_exp_512
+
+_mod_exp_512:
+ pushq %rbp
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+
+ movq %rsp,%r8
+ subq $2688,%rsp
+ andq $-64,%rsp
+
+
+ movq %r8,0(%rsp)
+ movq %rdi,8(%rsp)
+ movq %rsi,16(%rsp)
+ movq %rcx,24(%rsp)
+L$body:
+
+
+
+ pxor %xmm4,%xmm4
+ movdqu 0(%rsi),%xmm0
+ movdqu 16(%rsi),%xmm1
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm3
+ movdqa %xmm4,512(%rsp)
+ movdqa %xmm4,528(%rsp)
+ movdqa %xmm4,608(%rsp)
+ movdqa %xmm4,624(%rsp)
+ movdqa %xmm0,544(%rsp)
+ movdqa %xmm1,560(%rsp)
+ movdqa %xmm2,576(%rsp)
+ movdqa %xmm3,592(%rsp)
+
+
+ movdqu 0(%rdx),%xmm0
+ movdqu 16(%rdx),%xmm1
+ movdqu 32(%rdx),%xmm2
+ movdqu 48(%rdx),%xmm3
+
+ leaq 384(%rsp),%rbx
+ movq %rbx,136(%rsp)
+ call mont_reduce
+
+
+ leaq 448(%rsp),%rcx
+ xorq %rax,%rax
+ movq %rax,0(%rcx)
+ movq %rax,8(%rcx)
+ movq %rax,24(%rcx)
+ movq %rax,32(%rcx)
+ movq %rax,40(%rcx)
+ movq %rax,48(%rcx)
+ movq %rax,56(%rcx)
+ movq %rax,128(%rsp)
+ movq $1,16(%rcx)
+
+ leaq 640(%rsp),%rbp
+ movq %rcx,%rsi
+ movq %rbp,%rdi
+ movq $8,%rax
+loop_0:
+ movq (%rcx),%rbx
+ movw %bx,(%rdi)
+ shrq $16,%rbx
+ movw %bx,64(%rdi)
+ shrq $16,%rbx
+ movw %bx,128(%rdi)
+ shrq $16,%rbx
+ movw %bx,192(%rdi)
+ leaq 8(%rcx),%rcx
+ leaq 256(%rdi),%rdi
+ decq %rax
+ jnz loop_0
+ movq $31,%rax
+ movq %rax,32(%rsp)
+ movq %rbp,40(%rsp)
+
+ movq %rsi,136(%rsp)
+ movq 0(%rsi),%r10
+ movq 8(%rsi),%r11
+ movq 16(%rsi),%r12
+ movq 24(%rsi),%r13
+ movq 32(%rsi),%r14
+ movq 40(%rsi),%r15
+ movq 48(%rsi),%r8
+ movq 56(%rsi),%r9
+init_loop:
+ leaq 384(%rsp),%rdi
+ call mont_mul_a3b
+ leaq 448(%rsp),%rsi
+ movq 40(%rsp),%rbp
+ addq $2,%rbp
+ movq %rbp,40(%rsp)
+ movq %rsi,%rcx
+ movq $8,%rax
+loop_1:
+ movq (%rcx),%rbx
+ movw %bx,(%rbp)
+ shrq $16,%rbx
+ movw %bx,64(%rbp)
+ shrq $16,%rbx
+ movw %bx,128(%rbp)
+ shrq $16,%rbx
+ movw %bx,192(%rbp)
+ leaq 8(%rcx),%rcx
+ leaq 256(%rbp),%rbp
+ decq %rax
+ jnz loop_1
+ movq 32(%rsp),%rax
+ subq $1,%rax
+ movq %rax,32(%rsp)
+ jne init_loop
+
+
+
+ movdqa %xmm0,64(%rsp)
+ movdqa %xmm1,80(%rsp)
+ movdqa %xmm2,96(%rsp)
+ movdqa %xmm3,112(%rsp)
+
+
+
+
+
+ movl 126(%rsp),%eax
+ movq %rax,%rdx
+ shrq $11,%rax
+ andl $2047,%edx
+ movl %edx,126(%rsp)
+ leaq 640(%rsp,%rax,2),%rsi
+ movq 8(%rsp),%rdx
+ movq $4,%rbp
+loop_2:
+ movzwq 192(%rsi),%rbx
+ movzwq 448(%rsi),%rax
+ shlq $16,%rbx
+ shlq $16,%rax
+ movw 128(%rsi),%bx
+ movw 384(%rsi),%ax
+ shlq $16,%rbx
+ shlq $16,%rax
+ movw 64(%rsi),%bx
+ movw 320(%rsi),%ax
+ shlq $16,%rbx
+ shlq $16,%rax
+ movw 0(%rsi),%bx
+ movw 256(%rsi),%ax
+ movq %rbx,0(%rdx)
+ movq %rax,8(%rdx)
+ leaq 512(%rsi),%rsi
+ leaq 16(%rdx),%rdx
+ subq $1,%rbp
+ jnz loop_2
+ movq $505,48(%rsp)
+
+ movq 8(%rsp),%rcx
+ movq %rcx,136(%rsp)
+ movq 0(%rcx),%r10
+ movq 8(%rcx),%r11
+ movq 16(%rcx),%r12
+ movq 24(%rcx),%r13
+ movq 32(%rcx),%r14
+ movq 40(%rcx),%r15
+ movq 48(%rcx),%r8
+ movq 56(%rcx),%r9
+ jmp sqr_2
+
+main_loop_a3b:
+ call sqr_reduce
+ call sqr_reduce
+ call sqr_reduce
+sqr_2:
+ call sqr_reduce
+ call sqr_reduce
+
+
+
+ movq 48(%rsp),%rcx
+ movq %rcx,%rax
+ shrq $4,%rax
+ movl 64(%rsp,%rax,2),%edx
+ andq $15,%rcx
+ shrq %cl,%rdx
+ andq $31,%rdx
+
+ leaq 640(%rsp,%rdx,2),%rsi
+ leaq 448(%rsp),%rdx
+ movq %rdx,%rdi
+ movq $4,%rbp
+loop_3:
+ movzwq 192(%rsi),%rbx
+ movzwq 448(%rsi),%rax
+ shlq $16,%rbx
+ shlq $16,%rax
+ movw 128(%rsi),%bx
+ movw 384(%rsi),%ax
+ shlq $16,%rbx
+ shlq $16,%rax
+ movw 64(%rsi),%bx
+ movw 320(%rsi),%ax
+ shlq $16,%rbx
+ shlq $16,%rax
+ movw 0(%rsi),%bx
+ movw 256(%rsi),%ax
+ movq %rbx,0(%rdx)
+ movq %rax,8(%rdx)
+ leaq 512(%rsi),%rsi
+ leaq 16(%rdx),%rdx
+ subq $1,%rbp
+ jnz loop_3
+ movq 8(%rsp),%rsi
+ call mont_mul_a3b
+
+
+
+ movq 48(%rsp),%rcx
+ subq $5,%rcx
+ movq %rcx,48(%rsp)
+ jge main_loop_a3b
+
+
+
+end_main_loop_a3b:
+
+
+ movq 8(%rsp),%rdx
+ pxor %xmm4,%xmm4
+ movdqu 0(%rdx),%xmm0
+ movdqu 16(%rdx),%xmm1
+ movdqu 32(%rdx),%xmm2
+ movdqu 48(%rdx),%xmm3
+ movdqa %xmm4,576(%rsp)
+ movdqa %xmm4,592(%rsp)
+ movdqa %xmm4,608(%rsp)
+ movdqa %xmm4,624(%rsp)
+ movdqa %xmm0,512(%rsp)
+ movdqa %xmm1,528(%rsp)
+ movdqa %xmm2,544(%rsp)
+ movdqa %xmm3,560(%rsp)
+ call mont_reduce
+
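+# The reduced result may still be one multiple of the modulus too large:
+# subtract the modulus once and keep the subtracted copy via cmov only if
+# no borrow occurred (a branch-free select), then restore and return.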
+ movq 8(%rsp),%rax
+ movq 0(%rax),%r8
+ movq 8(%rax),%r9
+ movq 16(%rax),%r10
+ movq 24(%rax),%r11
+ movq 32(%rax),%r12
+ movq 40(%rax),%r13
+ movq 48(%rax),%r14
+ movq 56(%rax),%r15
+
+
+ movq 24(%rsp),%rbx
+ addq $512,%rbx
+
+ subq 0(%rbx),%r8
+ sbbq 8(%rbx),%r9
+ sbbq 16(%rbx),%r10
+ sbbq 24(%rbx),%r11
+ sbbq 32(%rbx),%r12
+ sbbq 40(%rbx),%r13
+ sbbq 48(%rbx),%r14
+ sbbq 56(%rbx),%r15
+
+
+ movq 0(%rax),%rsi
+ movq 8(%rax),%rdi
+ movq 16(%rax),%rcx
+ movq 24(%rax),%rdx
+ cmovncq %r8,%rsi
+ cmovncq %r9,%rdi
+ cmovncq %r10,%rcx
+ cmovncq %r11,%rdx
+ movq %rsi,0(%rax)
+ movq %rdi,8(%rax)
+ movq %rcx,16(%rax)
+ movq %rdx,24(%rax)
+
+ movq 32(%rax),%rsi
+ movq 40(%rax),%rdi
+ movq 48(%rax),%rcx
+ movq 56(%rax),%rdx
+ cmovncq %r12,%rsi
+ cmovncq %r13,%rdi
+ cmovncq %r14,%rcx
+ cmovncq %r15,%rdx
+ movq %rsi,32(%rax)
+ movq %rdi,40(%rax)
+ movq %rcx,48(%rax)
+ movq %rdx,56(%rax)
+
+ movq 0(%rsp),%rsi
+ movq 0(%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbx
+ movq 40(%rsi),%rbp
+ leaq 48(%rsi),%rsp
+L$epilogue:
+ .byte 0xf3,0xc3
+
+#endif
diff --git a/mac-x86_64/crypto/cpu-x86_64-asm.S b/mac-x86_64/crypto/cpu-x86_64-asm.S
new file mode 100644
index 0000000..0dde04d
--- /dev/null
+++ b/mac-x86_64/crypto/cpu-x86_64-asm.S
@@ -0,0 +1,143 @@
+#if defined(__x86_64__)
+.text
+
+.globl _OPENSSL_ia32_cpuid
+.private_extern _OPENSSL_ia32_cpuid
+
+.p2align 4
+_OPENSSL_ia32_cpuid:
+# Fills the caller's vector with CPUID feature words: leaf-1 EDX flags in
+# word 0, leaf-1 ECX flags in word 1, leaf-7 EBX in word 2.  The decimal
+# immediates below are the vendor strings: 0x756e6547/0x49656e69/
+# 0x6c65746e = "GenuineIntel", 0x68747541/0x69746e65/0x444d4163 =
+# "AuthenticAMD".
+
+
+ movq %rdi,%rdi
+ movq %rbx,%r8
+
+ xorl %eax,%eax
+ movl %eax,8(%rdi)
+ cpuid
+ movl %eax,%r11d
+
+ xorl %eax,%eax
+ cmpl $1970169159,%ebx
+ setne %al
+ movl %eax,%r9d
+ cmpl $1231384169,%edx
+ setne %al
+ orl %eax,%r9d
+ cmpl $1818588270,%ecx
+ setne %al
+ orl %eax,%r9d
+ jz L$intel
+
+ cmpl $1752462657,%ebx
+ setne %al
+ movl %eax,%r10d
+ cmpl $1769238117,%edx
+ setne %al
+ orl %eax,%r10d
+ cmpl $1145913699,%ecx
+ setne %al
+ orl %eax,%r10d
+ jnz L$intel
+
+
+
+
+ movl $2147483648,%eax
+ cpuid
+
+
+ cmpl $2147483649,%eax
+ jb L$intel
+ movl %eax,%r10d
+ movl $2147483649,%eax
+ cpuid
+
+
+ orl %ecx,%r9d
+ andl $2049,%r9d
+
+ cmpl $2147483656,%r10d
+ jb L$intel
+
+ movl $2147483656,%eax
+ cpuid
+
+ movzbq %cl,%r10
+ incq %r10
+
+ movl $1,%eax
+ cpuid
+
+ btl $28,%edx
+ jnc L$generic
+ shrl $16,%ebx
+ cmpb %r10b,%bl
+ ja L$generic
+ andl $4026531839,%edx
+ jmp L$generic
+
+L$intel:
+ cmpl $4,%r11d
+ movl $-1,%r10d
+ jb L$nocacheinfo
+
+ movl $4,%eax
+ movl $0,%ecx
+ cpuid
+ movl %eax,%r10d
+ shrl $14,%r10d
+ andl $4095,%r10d
+
+ cmpl $7,%r11d
+ jb L$nocacheinfo
+
+ movl $7,%eax
+ xorl %ecx,%ecx
+ cpuid
+ movl %ebx,8(%rdi)
+
+L$nocacheinfo:
+ movl $1,%eax
+ cpuid
+
+ andl $3220176895,%edx
+ cmpl $0,%r9d
+ jne L$notintel
+ orl $1073741824,%edx
+L$notintel:
+ btl $28,%edx
+ jnc L$generic
+ andl $4026531839,%edx
+ cmpl $0,%r10d
+ je L$generic
+
+ orl $268435456,%edx
+ shrl $16,%ebx
+ cmpb $1,%bl
+ ja L$generic
+ andl $4026531839,%edx
+L$generic:
+ andl $2048,%r9d
+ andl $4294965247,%ecx
+ orl %ecx,%r9d
+
+ movl %edx,%r10d
+ btl $27,%r9d
+ jnc L$clear_avx
+ xorl %ecx,%ecx
+# xgetbv (emitted as raw bytes for older assemblers): only keep the AVX
+# bits if the OS has enabled XMM+YMM state (XCR0 & 6 == 6).
+.byte 0x0f,0x01,0xd0
+ andl $6,%eax
+ cmpl $6,%eax
+ je L$done
+L$clear_avx:
+ movl $4026525695,%eax
+ andl %eax,%r9d
+ andl $4294967263,8(%rdi)
+L$done:
+ movl %r9d,4(%rdi)
+ movl %r10d,0(%rdi)
+ movq %r8,%rbx
+ .byte 0xf3,0xc3
+
+
+#endif
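The second file's _OPENSSL_ia32_cpuid fills a four-word capability vector that the C side of the library consults before picking assembly paths. A hedged sketch of reading such a vector, assuming BoringSSL's internal prototype void OPENSSL_ia32_cpuid(uint32_t out[4]) (the symbol is private_extern above, so this only links from inside the library; the bit positions shown are the architectural CPUID ones):

    #include <stdint.h>
    #include <stdio.h>

    /* Internal BoringSSL entry point (assumed prototype). */
    extern void OPENSSL_ia32_cpuid(uint32_t out[4]);

    int main(void) {
      uint32_t cap[4] = {0, 0, 0, 0};
      OPENSSL_ia32_cpuid(cap);
      /* cap[0] = leaf-1 EDX flags, cap[1] = leaf-1 ECX flags,
       * cap[2] = leaf-7 EBX flags (see the stores to 0/4/8(%rdi) above). */
      printf("AES-NI: %u\n", (cap[1] >> 25) & 1); /* leaf 1, ECX bit 25 */
      printf("AVX:    %u\n", (cap[1] >> 28) & 1); /* cleared unless xgetbv OKs YMM */
      printf("AVX2:   %u\n", (cap[2] >> 5) & 1);  /* leaf 7, EBX bit 5 */
      return 0;
    }

Note the L$clear_avx path: when the OS has not enabled YMM state, the routine strips the AVX and FMA bits from word 1 and the AVX2 bit from word 2, so consumers can test the mask directly without repeating the xgetbv check.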