diff options
author | Adam Langley <agl@google.com> | 2015-05-11 17:20:37 -0700 |
---|---|---|
committer | Kenny Root <kroot@google.com> | 2015-05-12 23:06:14 +0000 |
commit | e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5 (patch) | |
tree | 6e43e34595ecf887c26c32b86d8ab097fe8cac64 /linux-x86 | |
parent | b3106a0cc1493bbe0505c0ec0ce3da4ca90a29ae (diff) | |
download | external_boringssl-e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5.zip external_boringssl-e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5.tar.gz external_boringssl-e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5.tar.bz2 |
external/boringssl: bump revision.
This change bumps the BoringSSL revision to the current tip-of-tree.
Change-Id: I91d5bf467e16e8d86cb19a4de873985f524e5faa
Diffstat (limited to 'linux-x86')
-rw-r--r-- | linux-x86/crypto/aes/aesni-x86.S | 789 | ||||
-rw-r--r-- | linux-x86/crypto/bn/bn-586.S | 437 | ||||
-rw-r--r-- | linux-x86/crypto/bn/x86-mont.S | 172 | ||||
-rw-r--r-- | linux-x86/crypto/cpu-x86-asm.S | 28 | ||||
-rw-r--r-- | linux-x86/crypto/sha/sha1-586.S | 1422 | ||||
-rw-r--r-- | linux-x86/crypto/sha/sha256-586.S | 1643 | ||||
-rw-r--r-- | linux-x86/crypto/sha/sha512-586.S | 2271 |
7 files changed, 6097 insertions, 665 deletions
diff --git a/linux-x86/crypto/aes/aesni-x86.S b/linux-x86/crypto/aes/aesni-x86.S index 5aee116..aec110d 100644 --- a/linux-x86/crypto/aes/aesni-x86.S +++ b/linux-x86/crypto/aes/aesni-x86.S @@ -23,7 +23,10 @@ aesni_encrypt: leal 16(%edx),%edx jnz .L000enc1_loop_1 .byte 102,15,56,221,209 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 movups %xmm2,(%eax) + pxor %xmm2,%xmm2 ret .size aesni_encrypt,.-.L_aesni_encrypt_begin .globl aesni_decrypt @@ -48,7 +51,10 @@ aesni_decrypt: leal 16(%edx),%edx jnz .L001dec1_loop_2 .byte 102,15,56,223,209 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 movups %xmm2,(%eax) + pxor %xmm2,%xmm2 ret .size aesni_decrypt,.-.L_aesni_decrypt_begin .hidden _aesni_encrypt2 @@ -269,17 +275,15 @@ _aesni_encrypt6: negl %ecx .byte 102,15,56,220,225 pxor %xmm0,%xmm7 + movups (%edx,%ecx,1),%xmm0 addl $16,%ecx -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 - movups -16(%edx,%ecx,1),%xmm0 - jmp .L_aesni_encrypt6_enter + jmp .L008_aesni_encrypt6_inner .align 16 -.L008enc6_loop: +.L009enc6_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 +.L008_aesni_encrypt6_inner: .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 @@ -293,7 +297,7 @@ _aesni_encrypt6: .byte 102,15,56,220,240 .byte 102,15,56,220,248 movups -16(%edx,%ecx,1),%xmm0 - jnz .L008enc6_loop + jnz .L009enc6_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 @@ -326,17 +330,15 @@ _aesni_decrypt6: negl %ecx .byte 102,15,56,222,225 pxor %xmm0,%xmm7 + movups (%edx,%ecx,1),%xmm0 addl $16,%ecx -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 - movups -16(%edx,%ecx,1),%xmm0 - jmp .L_aesni_decrypt6_enter + jmp .L010_aesni_decrypt6_inner .align 16 -.L009dec6_loop: +.L011dec6_loop: .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 +.L010_aesni_decrypt6_inner: .byte 102,15,56,222,233 .byte 102,15,56,222,241 .byte 102,15,56,222,249 @@ -350,7 +352,7 @@ _aesni_decrypt6: .byte 102,15,56,222,240 .byte 102,15,56,222,248 movups -16(%edx,%ecx,1),%xmm0 - jnz .L009dec6_loop + jnz .L011dec6_loop .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 @@ -381,14 +383,14 @@ aesni_ecb_encrypt: movl 32(%esp),%edx movl 36(%esp),%ebx andl $-16,%eax - jz .L010ecb_ret + jz .L012ecb_ret movl 240(%edx),%ecx testl %ebx,%ebx - jz .L011ecb_decrypt + jz .L013ecb_decrypt movl %edx,%ebp movl %ecx,%ebx cmpl $96,%eax - jb .L012ecb_enc_tail + jb .L014ecb_enc_tail movdqu (%esi),%xmm2 movdqu 16(%esi),%xmm3 movdqu 32(%esi),%xmm4 @@ -397,9 +399,9 @@ aesni_ecb_encrypt: movdqu 80(%esi),%xmm7 leal 96(%esi),%esi subl $96,%eax - jmp .L013ecb_enc_loop6_enter + jmp .L015ecb_enc_loop6_enter .align 16 -.L014ecb_enc_loop6: +.L016ecb_enc_loop6: movups %xmm2,(%edi) movdqu (%esi),%xmm2 movups %xmm3,16(%edi) @@ -414,12 +416,12 @@ aesni_ecb_encrypt: leal 96(%edi),%edi movdqu 80(%esi),%xmm7 leal 96(%esi),%esi -.L013ecb_enc_loop6_enter: +.L015ecb_enc_loop6_enter: call _aesni_encrypt6 movl %ebp,%edx movl %ebx,%ecx subl $96,%eax - jnc .L014ecb_enc_loop6 + jnc .L016ecb_enc_loop6 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) @@ -428,18 +430,18 @@ aesni_ecb_encrypt: movups %xmm7,80(%edi) leal 96(%edi),%edi addl $96,%eax - jz .L010ecb_ret -.L012ecb_enc_tail: + jz .L012ecb_ret +.L014ecb_enc_tail: movups (%esi),%xmm2 cmpl $32,%eax - jb .L015ecb_enc_one + jb .L017ecb_enc_one movups 16(%esi),%xmm3 - je .L016ecb_enc_two + je .L018ecb_enc_two movups 32(%esi),%xmm4 cmpl $64,%eax - jb .L017ecb_enc_three + jb .L019ecb_enc_three movups 48(%esi),%xmm5 - je .L018ecb_enc_four + je .L020ecb_enc_four movups 64(%esi),%xmm6 xorps %xmm7,%xmm7 call _aesni_encrypt6 @@ -448,49 +450,49 @@ aesni_ecb_encrypt: movups %xmm4,32(%edi) movups %xmm5,48(%edi) movups %xmm6,64(%edi) - jmp .L010ecb_ret + jmp .L012ecb_ret .align 16 -.L015ecb_enc_one: +.L017ecb_enc_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L019enc1_loop_3: +.L021enc1_loop_3: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L019enc1_loop_3 + jnz .L021enc1_loop_3 .byte 102,15,56,221,209 movups %xmm2,(%edi) - jmp .L010ecb_ret + jmp .L012ecb_ret .align 16 -.L016ecb_enc_two: +.L018ecb_enc_two: call _aesni_encrypt2 movups %xmm2,(%edi) movups %xmm3,16(%edi) - jmp .L010ecb_ret + jmp .L012ecb_ret .align 16 -.L017ecb_enc_three: +.L019ecb_enc_three: call _aesni_encrypt3 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) - jmp .L010ecb_ret + jmp .L012ecb_ret .align 16 -.L018ecb_enc_four: +.L020ecb_enc_four: call _aesni_encrypt4 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) movups %xmm5,48(%edi) - jmp .L010ecb_ret + jmp .L012ecb_ret .align 16 -.L011ecb_decrypt: +.L013ecb_decrypt: movl %edx,%ebp movl %ecx,%ebx cmpl $96,%eax - jb .L020ecb_dec_tail + jb .L022ecb_dec_tail movdqu (%esi),%xmm2 movdqu 16(%esi),%xmm3 movdqu 32(%esi),%xmm4 @@ -499,9 +501,9 @@ aesni_ecb_encrypt: movdqu 80(%esi),%xmm7 leal 96(%esi),%esi subl $96,%eax - jmp .L021ecb_dec_loop6_enter + jmp .L023ecb_dec_loop6_enter .align 16 -.L022ecb_dec_loop6: +.L024ecb_dec_loop6: movups %xmm2,(%edi) movdqu (%esi),%xmm2 movups %xmm3,16(%edi) @@ -516,12 +518,12 @@ aesni_ecb_encrypt: leal 96(%edi),%edi movdqu 80(%esi),%xmm7 leal 96(%esi),%esi -.L021ecb_dec_loop6_enter: +.L023ecb_dec_loop6_enter: call _aesni_decrypt6 movl %ebp,%edx movl %ebx,%ecx subl $96,%eax - jnc .L022ecb_dec_loop6 + jnc .L024ecb_dec_loop6 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) @@ -530,18 +532,18 @@ aesni_ecb_encrypt: movups %xmm7,80(%edi) leal 96(%edi),%edi addl $96,%eax - jz .L010ecb_ret -.L020ecb_dec_tail: + jz .L012ecb_ret +.L022ecb_dec_tail: movups (%esi),%xmm2 cmpl $32,%eax - jb .L023ecb_dec_one + jb .L025ecb_dec_one movups 16(%esi),%xmm3 - je .L024ecb_dec_two + je .L026ecb_dec_two movups 32(%esi),%xmm4 cmpl $64,%eax - jb .L025ecb_dec_three + jb .L027ecb_dec_three movups 48(%esi),%xmm5 - je .L026ecb_dec_four + je .L028ecb_dec_four movups 64(%esi),%xmm6 xorps %xmm7,%xmm7 call _aesni_decrypt6 @@ -550,43 +552,51 @@ aesni_ecb_encrypt: movups %xmm4,32(%edi) movups %xmm5,48(%edi) movups %xmm6,64(%edi) - jmp .L010ecb_ret + jmp .L012ecb_ret .align 16 -.L023ecb_dec_one: +.L025ecb_dec_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L027dec1_loop_4: +.L029dec1_loop_4: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L027dec1_loop_4 + jnz .L029dec1_loop_4 .byte 102,15,56,223,209 movups %xmm2,(%edi) - jmp .L010ecb_ret + jmp .L012ecb_ret .align 16 -.L024ecb_dec_two: +.L026ecb_dec_two: call _aesni_decrypt2 movups %xmm2,(%edi) movups %xmm3,16(%edi) - jmp .L010ecb_ret + jmp .L012ecb_ret .align 16 -.L025ecb_dec_three: +.L027ecb_dec_three: call _aesni_decrypt3 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) - jmp .L010ecb_ret + jmp .L012ecb_ret .align 16 -.L026ecb_dec_four: +.L028ecb_dec_four: call _aesni_decrypt4 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) movups %xmm5,48(%edi) -.L010ecb_ret: +.L012ecb_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 popl %edi popl %esi popl %ebx @@ -634,7 +644,7 @@ aesni_ccm64_encrypt_blocks: leal 32(%edx,%ecx,1),%edx subl %ecx,%ebx .byte 102,15,56,0,253 -.L028ccm64_enc_outer: +.L030ccm64_enc_outer: movups (%ebp),%xmm0 movl %ebx,%ecx movups (%esi),%xmm6 @@ -643,7 +653,7 @@ aesni_ccm64_encrypt_blocks: xorps %xmm6,%xmm0 xorps %xmm0,%xmm3 movups 32(%ebp),%xmm0 -.L029ccm64_enc2_loop: +.L031ccm64_enc2_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 movups (%edx,%ecx,1),%xmm1 @@ -651,7 +661,7 @@ aesni_ccm64_encrypt_blocks: .byte 102,15,56,220,208 .byte 102,15,56,220,216 movups -16(%edx,%ecx,1),%xmm0 - jnz .L029ccm64_enc2_loop + jnz .L031ccm64_enc2_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 paddq 16(%esp),%xmm7 @@ -664,10 +674,18 @@ aesni_ccm64_encrypt_blocks: movups %xmm6,(%edi) .byte 102,15,56,0,213 leal 16(%edi),%edi - jnz .L028ccm64_enc_outer + jnz .L030ccm64_enc_outer movl 48(%esp),%esp movl 40(%esp),%edi movups %xmm3,(%edi) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 popl %edi popl %esi popl %ebx @@ -716,12 +734,12 @@ aesni_ccm64_decrypt_blocks: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L030enc1_loop_5: +.L032enc1_loop_5: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L030enc1_loop_5 + jnz .L032enc1_loop_5 .byte 102,15,56,221,209 shll $4,%ebx movl $16,%ecx @@ -731,16 +749,16 @@ aesni_ccm64_decrypt_blocks: subl %ebx,%ecx leal 32(%ebp,%ebx,1),%edx movl %ecx,%ebx - jmp .L031ccm64_dec_outer + jmp .L033ccm64_dec_outer .align 16 -.L031ccm64_dec_outer: +.L033ccm64_dec_outer: xorps %xmm2,%xmm6 movdqa %xmm7,%xmm2 movups %xmm6,(%edi) leal 16(%edi),%edi .byte 102,15,56,0,213 subl $1,%eax - jz .L032ccm64_dec_break + jz .L034ccm64_dec_break movups (%ebp),%xmm0 movl %ebx,%ecx movups 16(%ebp),%xmm1 @@ -748,7 +766,7 @@ aesni_ccm64_decrypt_blocks: xorps %xmm0,%xmm2 xorps %xmm6,%xmm3 movups 32(%ebp),%xmm0 -.L033ccm64_dec2_loop: +.L035ccm64_dec2_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 movups (%edx,%ecx,1),%xmm1 @@ -756,7 +774,7 @@ aesni_ccm64_decrypt_blocks: .byte 102,15,56,220,208 .byte 102,15,56,220,216 movups -16(%edx,%ecx,1),%xmm0 - jnz .L033ccm64_dec2_loop + jnz .L035ccm64_dec2_loop movups (%esi),%xmm6 paddq 16(%esp),%xmm7 .byte 102,15,56,220,209 @@ -764,9 +782,9 @@ aesni_ccm64_decrypt_blocks: .byte 102,15,56,221,208 .byte 102,15,56,221,216 leal 16(%esi),%esi - jmp .L031ccm64_dec_outer + jmp .L033ccm64_dec_outer .align 16 -.L032ccm64_dec_break: +.L034ccm64_dec_break: movl 240(%ebp),%ecx movl %ebp,%edx movups (%edx),%xmm0 @@ -774,16 +792,24 @@ aesni_ccm64_decrypt_blocks: xorps %xmm0,%xmm6 leal 32(%edx),%edx xorps %xmm6,%xmm3 -.L034enc1_loop_6: +.L036enc1_loop_6: .byte 102,15,56,220,217 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L034enc1_loop_6 + jnz .L036enc1_loop_6 .byte 102,15,56,221,217 movl 48(%esp),%esp movl 40(%esp),%edi movups %xmm3,(%edi) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 popl %edi popl %esi popl %ebx @@ -810,7 +836,7 @@ aesni_ctr32_encrypt_blocks: andl $-16,%esp movl %ebp,80(%esp) cmpl $1,%eax - je .L035ctr32_one_shortcut + je .L037ctr32_one_shortcut movdqu (%ebx),%xmm7 movl $202182159,(%esp) movl $134810123,4(%esp) @@ -848,7 +874,7 @@ aesni_ctr32_encrypt_blocks: pshufd $192,%xmm0,%xmm2 pshufd $128,%xmm0,%xmm3 cmpl $6,%eax - jb .L036ctr32_tail + jb .L038ctr32_tail pxor %xmm6,%xmm7 shll $4,%ecx movl $16,%ebx @@ -857,9 +883,9 @@ aesni_ctr32_encrypt_blocks: subl %ecx,%ebx leal 32(%edx,%ecx,1),%edx subl $6,%eax - jmp .L037ctr32_loop6 + jmp .L039ctr32_loop6 .align 16 -.L037ctr32_loop6: +.L039ctr32_loop6: pshufd $64,%xmm0,%xmm4 movdqa 32(%esp),%xmm0 pshufd $192,%xmm1,%xmm5 @@ -913,27 +939,27 @@ aesni_ctr32_encrypt_blocks: leal 96(%edi),%edi pshufd $128,%xmm0,%xmm3 subl $6,%eax - jnc .L037ctr32_loop6 + jnc .L039ctr32_loop6 addl $6,%eax - jz .L038ctr32_ret + jz .L040ctr32_ret movdqu (%ebp),%xmm7 movl %ebp,%edx pxor 32(%esp),%xmm7 movl 240(%ebp),%ecx -.L036ctr32_tail: +.L038ctr32_tail: por %xmm7,%xmm2 cmpl $2,%eax - jb .L039ctr32_one + jb .L041ctr32_one pshufd $64,%xmm0,%xmm4 por %xmm7,%xmm3 - je .L040ctr32_two + je .L042ctr32_two pshufd $192,%xmm1,%xmm5 por %xmm7,%xmm4 cmpl $4,%eax - jb .L041ctr32_three + jb .L043ctr32_three pshufd $128,%xmm1,%xmm6 por %xmm7,%xmm5 - je .L042ctr32_four + je .L044ctr32_four por %xmm7,%xmm6 call _aesni_encrypt6 movups (%esi),%xmm1 @@ -951,29 +977,29 @@ aesni_ctr32_encrypt_blocks: movups %xmm4,32(%edi) movups %xmm5,48(%edi) movups %xmm6,64(%edi) - jmp .L038ctr32_ret + jmp .L040ctr32_ret .align 16 -.L035ctr32_one_shortcut: +.L037ctr32_one_shortcut: movups (%ebx),%xmm2 movl 240(%edx),%ecx -.L039ctr32_one: +.L041ctr32_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L043enc1_loop_7: +.L045enc1_loop_7: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L043enc1_loop_7 + jnz .L045enc1_loop_7 .byte 102,15,56,221,209 movups (%esi),%xmm6 xorps %xmm2,%xmm6 movups %xmm6,(%edi) - jmp .L038ctr32_ret + jmp .L040ctr32_ret .align 16 -.L040ctr32_two: +.L042ctr32_two: call _aesni_encrypt2 movups (%esi),%xmm5 movups 16(%esi),%xmm6 @@ -981,9 +1007,9 @@ aesni_ctr32_encrypt_blocks: xorps %xmm6,%xmm3 movups %xmm2,(%edi) movups %xmm3,16(%edi) - jmp .L038ctr32_ret + jmp .L040ctr32_ret .align 16 -.L041ctr32_three: +.L043ctr32_three: call _aesni_encrypt3 movups (%esi),%xmm5 movups 16(%esi),%xmm6 @@ -994,9 +1020,9 @@ aesni_ctr32_encrypt_blocks: xorps %xmm7,%xmm4 movups %xmm3,16(%edi) movups %xmm4,32(%edi) - jmp .L038ctr32_ret + jmp .L040ctr32_ret .align 16 -.L042ctr32_four: +.L044ctr32_four: call _aesni_encrypt4 movups (%esi),%xmm6 movups 16(%esi),%xmm7 @@ -1010,7 +1036,18 @@ aesni_ctr32_encrypt_blocks: xorps %xmm0,%xmm5 movups %xmm4,32(%edi) movups %xmm5,48(%edi) -.L038ctr32_ret: +.L040ctr32_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + movdqa %xmm0,32(%esp) + pxor %xmm5,%xmm5 + movdqa %xmm0,48(%esp) + pxor %xmm6,%xmm6 + movdqa %xmm0,64(%esp) + pxor %xmm7,%xmm7 movl 80(%esp),%esp popl %edi popl %esi @@ -1036,12 +1073,12 @@ aesni_xts_encrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L044enc1_loop_8: +.L046enc1_loop_8: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L044enc1_loop_8 + jnz .L046enc1_loop_8 .byte 102,15,56,221,209 movl 20(%esp),%esi movl 24(%esp),%edi @@ -1065,14 +1102,14 @@ aesni_xts_encrypt: movl %edx,%ebp movl %ecx,%ebx subl $96,%eax - jc .L045xts_enc_short + jc .L047xts_enc_short shll $4,%ecx movl $16,%ebx subl %ecx,%ebx leal 32(%edx,%ecx,1),%edx - jmp .L046xts_enc_loop6 + jmp .L048xts_enc_loop6 .align 16 -.L046xts_enc_loop6: +.L048xts_enc_loop6: pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,(%esp) @@ -1161,23 +1198,23 @@ aesni_xts_encrypt: pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 subl $96,%eax - jnc .L046xts_enc_loop6 + jnc .L048xts_enc_loop6 movl 240(%ebp),%ecx movl %ebp,%edx movl %ecx,%ebx -.L045xts_enc_short: +.L047xts_enc_short: addl $96,%eax - jz .L047xts_enc_done6x + jz .L049xts_enc_done6x movdqa %xmm1,%xmm5 cmpl $32,%eax - jb .L048xts_enc_one + jb .L050xts_enc_one pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 paddq %xmm1,%xmm1 pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 - je .L049xts_enc_two + je .L051xts_enc_two pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm6 @@ -1186,7 +1223,7 @@ aesni_xts_encrypt: pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 cmpl $64,%eax - jb .L050xts_enc_three + jb .L052xts_enc_three pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm7 @@ -1196,7 +1233,7 @@ aesni_xts_encrypt: pxor %xmm2,%xmm1 movdqa %xmm5,(%esp) movdqa %xmm6,16(%esp) - je .L051xts_enc_four + je .L053xts_enc_four movdqa %xmm7,32(%esp) pshufd $19,%xmm0,%xmm7 movdqa %xmm1,48(%esp) @@ -1228,9 +1265,9 @@ aesni_xts_encrypt: movups %xmm5,48(%edi) movups %xmm6,64(%edi) leal 80(%edi),%edi - jmp .L052xts_enc_done + jmp .L054xts_enc_done .align 16 -.L048xts_enc_one: +.L050xts_enc_one: movups (%esi),%xmm2 leal 16(%esi),%esi xorps %xmm5,%xmm2 @@ -1238,20 +1275,20 @@ aesni_xts_encrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L053enc1_loop_9: +.L055enc1_loop_9: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L053enc1_loop_9 + jnz .L055enc1_loop_9 .byte 102,15,56,221,209 xorps %xmm5,%xmm2 movups %xmm2,(%edi) leal 16(%edi),%edi movdqa %xmm5,%xmm1 - jmp .L052xts_enc_done + jmp .L054xts_enc_done .align 16 -.L049xts_enc_two: +.L051xts_enc_two: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1265,9 +1302,9 @@ aesni_xts_encrypt: movups %xmm3,16(%edi) leal 32(%edi),%edi movdqa %xmm6,%xmm1 - jmp .L052xts_enc_done + jmp .L054xts_enc_done .align 16 -.L050xts_enc_three: +.L052xts_enc_three: movaps %xmm1,%xmm7 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1285,9 +1322,9 @@ aesni_xts_encrypt: movups %xmm4,32(%edi) leal 48(%edi),%edi movdqa %xmm7,%xmm1 - jmp .L052xts_enc_done + jmp .L054xts_enc_done .align 16 -.L051xts_enc_four: +.L053xts_enc_four: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1309,28 +1346,28 @@ aesni_xts_encrypt: movups %xmm5,48(%edi) leal 64(%edi),%edi movdqa %xmm6,%xmm1 - jmp .L052xts_enc_done + jmp .L054xts_enc_done .align 16 -.L047xts_enc_done6x: +.L049xts_enc_done6x: movl 112(%esp),%eax andl $15,%eax - jz .L054xts_enc_ret + jz .L056xts_enc_ret movdqa %xmm1,%xmm5 movl %eax,112(%esp) - jmp .L055xts_enc_steal + jmp .L057xts_enc_steal .align 16 -.L052xts_enc_done: +.L054xts_enc_done: movl 112(%esp),%eax pxor %xmm0,%xmm0 andl $15,%eax - jz .L054xts_enc_ret + jz .L056xts_enc_ret pcmpgtd %xmm1,%xmm0 movl %eax,112(%esp) pshufd $19,%xmm0,%xmm5 paddq %xmm1,%xmm1 pand 96(%esp),%xmm5 pxor %xmm1,%xmm5 -.L055xts_enc_steal: +.L057xts_enc_steal: movzbl (%esi),%ecx movzbl -16(%edi),%edx leal 1(%esi),%esi @@ -1338,7 +1375,7 @@ aesni_xts_encrypt: movb %dl,(%edi) leal 1(%edi),%edi subl $1,%eax - jnz .L055xts_enc_steal + jnz .L057xts_enc_steal subl 112(%esp),%edi movl %ebp,%edx movl %ebx,%ecx @@ -1348,16 +1385,30 @@ aesni_xts_encrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L056enc1_loop_10: +.L058enc1_loop_10: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L056enc1_loop_10 + jnz .L058enc1_loop_10 .byte 102,15,56,221,209 xorps %xmm5,%xmm2 movups %xmm2,-16(%edi) -.L054xts_enc_ret: +.L056xts_enc_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + movdqa %xmm0,(%esp) + pxor %xmm3,%xmm3 + movdqa %xmm0,16(%esp) + pxor %xmm4,%xmm4 + movdqa %xmm0,32(%esp) + pxor %xmm5,%xmm5 + movdqa %xmm0,48(%esp) + pxor %xmm6,%xmm6 + movdqa %xmm0,64(%esp) + pxor %xmm7,%xmm7 + movdqa %xmm0,80(%esp) movl 116(%esp),%esp popl %edi popl %esi @@ -1383,12 +1434,12 @@ aesni_xts_decrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L057enc1_loop_11: +.L059enc1_loop_11: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L057enc1_loop_11 + jnz .L059enc1_loop_11 .byte 102,15,56,221,209 movl 20(%esp),%esi movl 24(%esp),%edi @@ -1417,14 +1468,14 @@ aesni_xts_decrypt: pcmpgtd %xmm1,%xmm0 andl $-16,%eax subl $96,%eax - jc .L058xts_dec_short + jc .L060xts_dec_short shll $4,%ecx movl $16,%ebx subl %ecx,%ebx leal 32(%edx,%ecx,1),%edx - jmp .L059xts_dec_loop6 + jmp .L061xts_dec_loop6 .align 16 -.L059xts_dec_loop6: +.L061xts_dec_loop6: pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,(%esp) @@ -1513,23 +1564,23 @@ aesni_xts_decrypt: pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 subl $96,%eax - jnc .L059xts_dec_loop6 + jnc .L061xts_dec_loop6 movl 240(%ebp),%ecx movl %ebp,%edx movl %ecx,%ebx -.L058xts_dec_short: +.L060xts_dec_short: addl $96,%eax - jz .L060xts_dec_done6x + jz .L062xts_dec_done6x movdqa %xmm1,%xmm5 cmpl $32,%eax - jb .L061xts_dec_one + jb .L063xts_dec_one pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 paddq %xmm1,%xmm1 pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 - je .L062xts_dec_two + je .L064xts_dec_two pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm6 @@ -1538,7 +1589,7 @@ aesni_xts_decrypt: pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 cmpl $64,%eax - jb .L063xts_dec_three + jb .L065xts_dec_three pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm7 @@ -1548,7 +1599,7 @@ aesni_xts_decrypt: pxor %xmm2,%xmm1 movdqa %xmm5,(%esp) movdqa %xmm6,16(%esp) - je .L064xts_dec_four + je .L066xts_dec_four movdqa %xmm7,32(%esp) pshufd $19,%xmm0,%xmm7 movdqa %xmm1,48(%esp) @@ -1580,9 +1631,9 @@ aesni_xts_decrypt: movups %xmm5,48(%edi) movups %xmm6,64(%edi) leal 80(%edi),%edi - jmp .L065xts_dec_done + jmp .L067xts_dec_done .align 16 -.L061xts_dec_one: +.L063xts_dec_one: movups (%esi),%xmm2 leal 16(%esi),%esi xorps %xmm5,%xmm2 @@ -1590,20 +1641,20 @@ aesni_xts_decrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L066dec1_loop_12: +.L068dec1_loop_12: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L066dec1_loop_12 + jnz .L068dec1_loop_12 .byte 102,15,56,223,209 xorps %xmm5,%xmm2 movups %xmm2,(%edi) leal 16(%edi),%edi movdqa %xmm5,%xmm1 - jmp .L065xts_dec_done + jmp .L067xts_dec_done .align 16 -.L062xts_dec_two: +.L064xts_dec_two: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1617,9 +1668,9 @@ aesni_xts_decrypt: movups %xmm3,16(%edi) leal 32(%edi),%edi movdqa %xmm6,%xmm1 - jmp .L065xts_dec_done + jmp .L067xts_dec_done .align 16 -.L063xts_dec_three: +.L065xts_dec_three: movaps %xmm1,%xmm7 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1637,9 +1688,9 @@ aesni_xts_decrypt: movups %xmm4,32(%edi) leal 48(%edi),%edi movdqa %xmm7,%xmm1 - jmp .L065xts_dec_done + jmp .L067xts_dec_done .align 16 -.L064xts_dec_four: +.L066xts_dec_four: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1661,20 +1712,20 @@ aesni_xts_decrypt: movups %xmm5,48(%edi) leal 64(%edi),%edi movdqa %xmm6,%xmm1 - jmp .L065xts_dec_done + jmp .L067xts_dec_done .align 16 -.L060xts_dec_done6x: +.L062xts_dec_done6x: movl 112(%esp),%eax andl $15,%eax - jz .L067xts_dec_ret + jz .L069xts_dec_ret movl %eax,112(%esp) - jmp .L068xts_dec_only_one_more + jmp .L070xts_dec_only_one_more .align 16 -.L065xts_dec_done: +.L067xts_dec_done: movl 112(%esp),%eax pxor %xmm0,%xmm0 andl $15,%eax - jz .L067xts_dec_ret + jz .L069xts_dec_ret pcmpgtd %xmm1,%xmm0 movl %eax,112(%esp) pshufd $19,%xmm0,%xmm2 @@ -1684,7 +1735,7 @@ aesni_xts_decrypt: pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 -.L068xts_dec_only_one_more: +.L070xts_dec_only_one_more: pshufd $19,%xmm0,%xmm5 movdqa %xmm1,%xmm6 paddq %xmm1,%xmm1 @@ -1698,16 +1749,16 @@ aesni_xts_decrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L069dec1_loop_13: +.L071dec1_loop_13: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L069dec1_loop_13 + jnz .L071dec1_loop_13 .byte 102,15,56,223,209 xorps %xmm5,%xmm2 movups %xmm2,(%edi) -.L070xts_dec_steal: +.L072xts_dec_steal: movzbl 16(%esi),%ecx movzbl (%edi),%edx leal 1(%esi),%esi @@ -1715,7 +1766,7 @@ aesni_xts_decrypt: movb %dl,16(%edi) leal 1(%edi),%edi subl $1,%eax - jnz .L070xts_dec_steal + jnz .L072xts_dec_steal subl 112(%esp),%edi movl %ebp,%edx movl %ebx,%ecx @@ -1725,16 +1776,30 @@ aesni_xts_decrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L071dec1_loop_14: +.L073dec1_loop_14: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L071dec1_loop_14 + jnz .L073dec1_loop_14 .byte 102,15,56,223,209 xorps %xmm6,%xmm2 movups %xmm2,(%edi) -.L067xts_dec_ret: +.L069xts_dec_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + movdqa %xmm0,(%esp) + pxor %xmm3,%xmm3 + movdqa %xmm0,16(%esp) + pxor %xmm4,%xmm4 + movdqa %xmm0,32(%esp) + pxor %xmm5,%xmm5 + movdqa %xmm0,48(%esp) + pxor %xmm6,%xmm6 + movdqa %xmm0,64(%esp) + pxor %xmm7,%xmm7 + movdqa %xmm0,80(%esp) movl 116(%esp),%esp popl %edi popl %esi @@ -1761,7 +1826,7 @@ aesni_cbc_encrypt: movl 32(%esp),%edx movl 36(%esp),%ebp testl %eax,%eax - jz .L072cbc_abort + jz .L074cbc_abort cmpl $0,40(%esp) xchgl %esp,%ebx movups (%ebp),%xmm7 @@ -1769,14 +1834,14 @@ aesni_cbc_encrypt: movl %edx,%ebp movl %ebx,16(%esp) movl %ecx,%ebx - je .L073cbc_decrypt + je .L075cbc_decrypt movaps %xmm7,%xmm2 cmpl $16,%eax - jb .L074cbc_enc_tail + jb .L076cbc_enc_tail subl $16,%eax - jmp .L075cbc_enc_loop + jmp .L077cbc_enc_loop .align 16 -.L075cbc_enc_loop: +.L077cbc_enc_loop: movups (%esi),%xmm7 leal 16(%esi),%esi movups (%edx),%xmm0 @@ -1784,24 +1849,25 @@ aesni_cbc_encrypt: xorps %xmm0,%xmm7 leal 32(%edx),%edx xorps %xmm7,%xmm2 -.L076enc1_loop_15: +.L078enc1_loop_15: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L076enc1_loop_15 + jnz .L078enc1_loop_15 .byte 102,15,56,221,209 movl %ebx,%ecx movl %ebp,%edx movups %xmm2,(%edi) leal 16(%edi),%edi subl $16,%eax - jnc .L075cbc_enc_loop + jnc .L077cbc_enc_loop addl $16,%eax - jnz .L074cbc_enc_tail + jnz .L076cbc_enc_tail movaps %xmm2,%xmm7 - jmp .L077cbc_ret -.L074cbc_enc_tail: + pxor %xmm2,%xmm2 + jmp .L079cbc_ret +.L076cbc_enc_tail: movl %eax,%ecx .long 2767451785 movl $16,%ecx @@ -1812,20 +1878,20 @@ aesni_cbc_encrypt: movl %ebx,%ecx movl %edi,%esi movl %ebp,%edx - jmp .L075cbc_enc_loop + jmp .L077cbc_enc_loop .align 16 -.L073cbc_decrypt: +.L075cbc_decrypt: cmpl $80,%eax - jbe .L078cbc_dec_tail + jbe .L080cbc_dec_tail movaps %xmm7,(%esp) subl $80,%eax - jmp .L079cbc_dec_loop6_enter + jmp .L081cbc_dec_loop6_enter .align 16 -.L080cbc_dec_loop6: +.L082cbc_dec_loop6: movaps %xmm0,(%esp) movups %xmm7,(%edi) leal 16(%edi),%edi -.L079cbc_dec_loop6_enter: +.L081cbc_dec_loop6_enter: movdqu (%esi),%xmm2 movdqu 16(%esi),%xmm3 movdqu 32(%esi),%xmm4 @@ -1855,28 +1921,28 @@ aesni_cbc_encrypt: movups %xmm6,64(%edi) leal 80(%edi),%edi subl $96,%eax - ja .L080cbc_dec_loop6 + ja .L082cbc_dec_loop6 movaps %xmm7,%xmm2 movaps %xmm0,%xmm7 addl $80,%eax - jle .L081cbc_dec_tail_collected + jle .L083cbc_dec_clear_tail_collected movups %xmm2,(%edi) leal 16(%edi),%edi -.L078cbc_dec_tail: +.L080cbc_dec_tail: movups (%esi),%xmm2 movaps %xmm2,%xmm6 cmpl $16,%eax - jbe .L082cbc_dec_one + jbe .L084cbc_dec_one movups 16(%esi),%xmm3 movaps %xmm3,%xmm5 cmpl $32,%eax - jbe .L083cbc_dec_two + jbe .L085cbc_dec_two movups 32(%esi),%xmm4 cmpl $48,%eax - jbe .L084cbc_dec_three + jbe .L086cbc_dec_three movups 48(%esi),%xmm5 cmpl $64,%eax - jbe .L085cbc_dec_four + jbe .L087cbc_dec_four movups 64(%esi),%xmm6 movaps %xmm7,(%esp) movups (%esi),%xmm2 @@ -1894,55 +1960,62 @@ aesni_cbc_encrypt: xorps %xmm0,%xmm6 movups %xmm2,(%edi) movups %xmm3,16(%edi) + pxor %xmm3,%xmm3 movups %xmm4,32(%edi) + pxor %xmm4,%xmm4 movups %xmm5,48(%edi) + pxor %xmm5,%xmm5 leal 64(%edi),%edi movaps %xmm6,%xmm2 + pxor %xmm6,%xmm6 subl $80,%eax - jmp .L081cbc_dec_tail_collected + jmp .L088cbc_dec_tail_collected .align 16 -.L082cbc_dec_one: +.L084cbc_dec_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L086dec1_loop_16: +.L089dec1_loop_16: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L086dec1_loop_16 + jnz .L089dec1_loop_16 .byte 102,15,56,223,209 xorps %xmm7,%xmm2 movaps %xmm6,%xmm7 subl $16,%eax - jmp .L081cbc_dec_tail_collected + jmp .L088cbc_dec_tail_collected .align 16 -.L083cbc_dec_two: +.L085cbc_dec_two: call _aesni_decrypt2 xorps %xmm7,%xmm2 xorps %xmm6,%xmm3 movups %xmm2,(%edi) movaps %xmm3,%xmm2 + pxor %xmm3,%xmm3 leal 16(%edi),%edi movaps %xmm5,%xmm7 subl $32,%eax - jmp .L081cbc_dec_tail_collected + jmp .L088cbc_dec_tail_collected .align 16 -.L084cbc_dec_three: +.L086cbc_dec_three: call _aesni_decrypt3 xorps %xmm7,%xmm2 xorps %xmm6,%xmm3 xorps %xmm5,%xmm4 movups %xmm2,(%edi) movaps %xmm4,%xmm2 + pxor %xmm4,%xmm4 movups %xmm3,16(%edi) + pxor %xmm3,%xmm3 leal 32(%edi),%edi movups 32(%esi),%xmm7 subl $48,%eax - jmp .L081cbc_dec_tail_collected + jmp .L088cbc_dec_tail_collected .align 16 -.L085cbc_dec_four: +.L087cbc_dec_four: call _aesni_decrypt4 movups 16(%esi),%xmm1 movups 32(%esi),%xmm0 @@ -1952,28 +2025,44 @@ aesni_cbc_encrypt: movups %xmm2,(%edi) xorps %xmm1,%xmm4 movups %xmm3,16(%edi) + pxor %xmm3,%xmm3 xorps %xmm0,%xmm5 movups %xmm4,32(%edi) + pxor %xmm4,%xmm4 leal 48(%edi),%edi movaps %xmm5,%xmm2 + pxor %xmm5,%xmm5 subl $64,%eax -.L081cbc_dec_tail_collected: + jmp .L088cbc_dec_tail_collected +.align 16 +.L083cbc_dec_clear_tail_collected: + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 +.L088cbc_dec_tail_collected: andl $15,%eax - jnz .L087cbc_dec_tail_partial + jnz .L090cbc_dec_tail_partial movups %xmm2,(%edi) - jmp .L077cbc_ret + pxor %xmm0,%xmm0 + jmp .L079cbc_ret .align 16 -.L087cbc_dec_tail_partial: +.L090cbc_dec_tail_partial: movaps %xmm2,(%esp) + pxor %xmm0,%xmm0 movl $16,%ecx movl %esp,%esi subl %eax,%ecx .long 2767451785 -.L077cbc_ret: + movdqa %xmm2,(%esp) +.L079cbc_ret: movl 16(%esp),%esp movl 36(%esp),%ebp + pxor %xmm2,%xmm2 + pxor %xmm1,%xmm1 movups %xmm7,(%ebp) -.L072cbc_abort: + pxor %xmm7,%xmm7 +.L074cbc_abort: popl %edi popl %esi popl %ebx @@ -1984,52 +2073,62 @@ aesni_cbc_encrypt: .type _aesni_set_encrypt_key,@function .align 16 _aesni_set_encrypt_key: + pushl %ebp + pushl %ebx testl %eax,%eax - jz .L088bad_pointer + jz .L091bad_pointer testl %edx,%edx - jz .L088bad_pointer + jz .L091bad_pointer + call .L092pic +.L092pic: + popl %ebx + leal .Lkey_const-.L092pic(%ebx),%ebx + leal OPENSSL_ia32cap_P-.Lkey_const(%ebx),%ebp movups (%eax),%xmm0 xorps %xmm4,%xmm4 + movl 4(%ebp),%ebp leal 16(%edx),%edx + andl $268437504,%ebp cmpl $256,%ecx - je .L08914rounds + je .L09314rounds cmpl $192,%ecx - je .L09012rounds + je .L09412rounds cmpl $128,%ecx - jne .L091bad_keybits + jne .L095bad_keybits .align 16 -.L09210rounds: +.L09610rounds: + cmpl $268435456,%ebp + je .L09710rounds_alt movl $9,%ecx movups %xmm0,-16(%edx) .byte 102,15,58,223,200,1 - call .L093key_128_cold + call .L098key_128_cold .byte 102,15,58,223,200,2 - call .L094key_128 + call .L099key_128 .byte 102,15,58,223,200,4 - call .L094key_128 + call .L099key_128 .byte 102,15,58,223,200,8 - call .L094key_128 + call .L099key_128 .byte 102,15,58,223,200,16 - call .L094key_128 + call .L099key_128 .byte 102,15,58,223,200,32 - call .L094key_128 + call .L099key_128 .byte 102,15,58,223,200,64 - call .L094key_128 + call .L099key_128 .byte 102,15,58,223,200,128 - call .L094key_128 + call .L099key_128 .byte 102,15,58,223,200,27 - call .L094key_128 + call .L099key_128 .byte 102,15,58,223,200,54 - call .L094key_128 + call .L099key_128 movups %xmm0,(%edx) movl %ecx,80(%edx) - xorl %eax,%eax - ret + jmp .L100good_key .align 16 -.L094key_128: +.L099key_128: movups %xmm0,(%edx) leal 16(%edx),%edx -.L093key_128_cold: +.L098key_128_cold: shufps $16,%xmm0,%xmm4 xorps %xmm4,%xmm0 shufps $140,%xmm0,%xmm4 @@ -2038,38 +2137,91 @@ _aesni_set_encrypt_key: xorps %xmm1,%xmm0 ret .align 16 -.L09012rounds: +.L09710rounds_alt: + movdqa (%ebx),%xmm5 + movl $8,%ecx + movdqa 32(%ebx),%xmm4 + movdqa %xmm0,%xmm2 + movdqu %xmm0,-16(%edx) +.L101loop_key128: +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + pslld $1,%xmm4 + leal 16(%edx),%edx + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + pxor %xmm2,%xmm0 + movdqu %xmm0,-16(%edx) + movdqa %xmm0,%xmm2 + decl %ecx + jnz .L101loop_key128 + movdqa 48(%ebx),%xmm4 +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + pslld $1,%xmm4 + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + pxor %xmm2,%xmm0 + movdqu %xmm0,(%edx) + movdqa %xmm0,%xmm2 +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + pxor %xmm2,%xmm0 + movdqu %xmm0,16(%edx) + movl $9,%ecx + movl %ecx,96(%edx) + jmp .L100good_key +.align 16 +.L09412rounds: movq 16(%eax),%xmm2 + cmpl $268435456,%ebp + je .L10212rounds_alt movl $11,%ecx movups %xmm0,-16(%edx) .byte 102,15,58,223,202,1 - call .L095key_192a_cold + call .L103key_192a_cold .byte 102,15,58,223,202,2 - call .L096key_192b + call .L104key_192b .byte 102,15,58,223,202,4 - call .L097key_192a + call .L105key_192a .byte 102,15,58,223,202,8 - call .L096key_192b + call .L104key_192b .byte 102,15,58,223,202,16 - call .L097key_192a + call .L105key_192a .byte 102,15,58,223,202,32 - call .L096key_192b + call .L104key_192b .byte 102,15,58,223,202,64 - call .L097key_192a + call .L105key_192a .byte 102,15,58,223,202,128 - call .L096key_192b + call .L104key_192b movups %xmm0,(%edx) movl %ecx,48(%edx) - xorl %eax,%eax - ret + jmp .L100good_key .align 16 -.L097key_192a: +.L105key_192a: movups %xmm0,(%edx) leal 16(%edx),%edx .align 16 -.L095key_192a_cold: +.L103key_192a_cold: movaps %xmm2,%xmm5 -.L098key_192b_warm: +.L106key_192b_warm: shufps $16,%xmm0,%xmm4 movdqa %xmm2,%xmm3 xorps %xmm4,%xmm0 @@ -2083,56 +2235,90 @@ _aesni_set_encrypt_key: pxor %xmm3,%xmm2 ret .align 16 -.L096key_192b: +.L104key_192b: movaps %xmm0,%xmm3 shufps $68,%xmm0,%xmm5 movups %xmm5,(%edx) shufps $78,%xmm2,%xmm3 movups %xmm3,16(%edx) leal 32(%edx),%edx - jmp .L098key_192b_warm + jmp .L106key_192b_warm +.align 16 +.L10212rounds_alt: + movdqa 16(%ebx),%xmm5 + movdqa 32(%ebx),%xmm4 + movl $8,%ecx + movdqu %xmm0,-16(%edx) +.L107loop_key192: + movq %xmm2,(%edx) + movdqa %xmm2,%xmm1 +.byte 102,15,56,0,213 +.byte 102,15,56,221,212 + pslld $1,%xmm4 + leal 24(%edx),%edx + movdqa %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm3,%xmm0 + pshufd $255,%xmm0,%xmm3 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pxor %xmm2,%xmm0 + pxor %xmm3,%xmm2 + movdqu %xmm0,-16(%edx) + decl %ecx + jnz .L107loop_key192 + movl $11,%ecx + movl %ecx,32(%edx) + jmp .L100good_key .align 16 -.L08914rounds: +.L09314rounds: movups 16(%eax),%xmm2 - movl $13,%ecx leal 16(%edx),%edx + cmpl $268435456,%ebp + je .L10814rounds_alt + movl $13,%ecx movups %xmm0,-32(%edx) movups %xmm2,-16(%edx) .byte 102,15,58,223,202,1 - call .L099key_256a_cold + call .L109key_256a_cold .byte 102,15,58,223,200,1 - call .L100key_256b + call .L110key_256b .byte 102,15,58,223,202,2 - call .L101key_256a + call .L111key_256a .byte 102,15,58,223,200,2 - call .L100key_256b + call .L110key_256b .byte 102,15,58,223,202,4 - call .L101key_256a + call .L111key_256a .byte 102,15,58,223,200,4 - call .L100key_256b + call .L110key_256b .byte 102,15,58,223,202,8 - call .L101key_256a + call .L111key_256a .byte 102,15,58,223,200,8 - call .L100key_256b + call .L110key_256b .byte 102,15,58,223,202,16 - call .L101key_256a + call .L111key_256a .byte 102,15,58,223,200,16 - call .L100key_256b + call .L110key_256b .byte 102,15,58,223,202,32 - call .L101key_256a + call .L111key_256a .byte 102,15,58,223,200,32 - call .L100key_256b + call .L110key_256b .byte 102,15,58,223,202,64 - call .L101key_256a + call .L111key_256a movups %xmm0,(%edx) movl %ecx,16(%edx) xorl %eax,%eax - ret + jmp .L100good_key .align 16 -.L101key_256a: +.L111key_256a: movups %xmm2,(%edx) leal 16(%edx),%edx -.L099key_256a_cold: +.L109key_256a_cold: shufps $16,%xmm0,%xmm4 xorps %xmm4,%xmm0 shufps $140,%xmm0,%xmm4 @@ -2141,7 +2327,7 @@ _aesni_set_encrypt_key: xorps %xmm1,%xmm0 ret .align 16 -.L100key_256b: +.L110key_256b: movups %xmm0,(%edx) leal 16(%edx),%edx shufps $16,%xmm2,%xmm4 @@ -2151,13 +2337,70 @@ _aesni_set_encrypt_key: shufps $170,%xmm1,%xmm1 xorps %xmm1,%xmm2 ret +.align 16 +.L10814rounds_alt: + movdqa (%ebx),%xmm5 + movdqa 32(%ebx),%xmm4 + movl $7,%ecx + movdqu %xmm0,-32(%edx) + movdqa %xmm2,%xmm1 + movdqu %xmm2,-16(%edx) +.L112loop_key256: +.byte 102,15,56,0,213 +.byte 102,15,56,221,212 + movdqa %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm3,%xmm0 + pslld $1,%xmm4 + pxor %xmm2,%xmm0 + movdqu %xmm0,(%edx) + decl %ecx + jz .L113done_key256 + pshufd $255,%xmm0,%xmm2 + pxor %xmm3,%xmm3 +.byte 102,15,56,221,211 + movdqa %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm3,%xmm1 + pxor %xmm1,%xmm2 + movdqu %xmm2,16(%edx) + leal 32(%edx),%edx + movdqa %xmm2,%xmm1 + jmp .L112loop_key256 +.L113done_key256: + movl $13,%ecx + movl %ecx,16(%edx) +.L100good_key: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + xorl %eax,%eax + popl %ebx + popl %ebp + ret .align 4 -.L088bad_pointer: +.L091bad_pointer: movl $-1,%eax + popl %ebx + popl %ebp ret .align 4 -.L091bad_keybits: +.L095bad_keybits: + pxor %xmm0,%xmm0 movl $-2,%eax + popl %ebx + popl %ebp ret .size _aesni_set_encrypt_key,.-_aesni_set_encrypt_key .globl aesni_set_encrypt_key @@ -2185,7 +2428,7 @@ aesni_set_decrypt_key: movl 12(%esp),%edx shll $4,%ecx testl %eax,%eax - jnz .L102dec_key_ret + jnz .L114dec_key_ret leal 16(%edx,%ecx,1),%eax movups (%edx),%xmm0 movups (%eax),%xmm1 @@ -2193,7 +2436,7 @@ aesni_set_decrypt_key: movups %xmm1,(%edx) leal 16(%edx),%edx leal -16(%eax),%eax -.L103dec_key_inverse: +.L115dec_key_inverse: movups (%edx),%xmm0 movups (%eax),%xmm1 .byte 102,15,56,219,192 @@ -2203,14 +2446,22 @@ aesni_set_decrypt_key: movups %xmm0,16(%eax) movups %xmm1,-16(%edx) cmpl %edx,%eax - ja .L103dec_key_inverse + ja .L115dec_key_inverse movups (%edx),%xmm0 .byte 102,15,56,219,192 movups %xmm0,(%edx) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 xorl %eax,%eax -.L102dec_key_ret: +.L114dec_key_ret: ret .size aesni_set_decrypt_key,.-.L_aesni_set_decrypt_key_begin +.align 64 +.Lkey_const: +.long 202313229,202313229,202313229,202313229 +.long 67569157,67569157,67569157,67569157 +.long 1,1,1,1 +.long 27,27,27,27 .byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69 .byte 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83 .byte 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115 diff --git a/linux-x86/crypto/bn/bn-586.S b/linux-x86/crypto/bn/bn-586.S index b953393..773beff 100644 --- a/linux-x86/crypto/bn/bn-586.S +++ b/linux-x86/crypto/bn/bn-586.S @@ -7,6 +7,102 @@ .align 16 bn_mul_add_words: .L_bn_mul_add_words_begin: + call .L000PIC_me_up +.L000PIC_me_up: + popl %eax + leal OPENSSL_ia32cap_P-.L000PIC_me_up(%eax),%eax + btl $26,(%eax) + jnc .L001maw_non_sse2 + movl 4(%esp),%eax + movl 8(%esp),%edx + movl 12(%esp),%ecx + movd 16(%esp),%mm0 + pxor %mm1,%mm1 + jmp .L002maw_sse2_entry +.align 16 +.L003maw_sse2_unrolled: + movd (%eax),%mm3 + paddq %mm3,%mm1 + movd (%edx),%mm2 + pmuludq %mm0,%mm2 + movd 4(%edx),%mm4 + pmuludq %mm0,%mm4 + movd 8(%edx),%mm6 + pmuludq %mm0,%mm6 + movd 12(%edx),%mm7 + pmuludq %mm0,%mm7 + paddq %mm2,%mm1 + movd 4(%eax),%mm3 + paddq %mm4,%mm3 + movd 8(%eax),%mm5 + paddq %mm6,%mm5 + movd 12(%eax),%mm4 + paddq %mm4,%mm7 + movd %mm1,(%eax) + movd 16(%edx),%mm2 + pmuludq %mm0,%mm2 + psrlq $32,%mm1 + movd 20(%edx),%mm4 + pmuludq %mm0,%mm4 + paddq %mm3,%mm1 + movd 24(%edx),%mm6 + pmuludq %mm0,%mm6 + movd %mm1,4(%eax) + psrlq $32,%mm1 + movd 28(%edx),%mm3 + addl $32,%edx + pmuludq %mm0,%mm3 + paddq %mm5,%mm1 + movd 16(%eax),%mm5 + paddq %mm5,%mm2 + movd %mm1,8(%eax) + psrlq $32,%mm1 + paddq %mm7,%mm1 + movd 20(%eax),%mm5 + paddq %mm5,%mm4 + movd %mm1,12(%eax) + psrlq $32,%mm1 + paddq %mm2,%mm1 + movd 24(%eax),%mm5 + paddq %mm5,%mm6 + movd %mm1,16(%eax) + psrlq $32,%mm1 + paddq %mm4,%mm1 + movd 28(%eax),%mm5 + paddq %mm5,%mm3 + movd %mm1,20(%eax) + psrlq $32,%mm1 + paddq %mm6,%mm1 + movd %mm1,24(%eax) + psrlq $32,%mm1 + paddq %mm3,%mm1 + movd %mm1,28(%eax) + leal 32(%eax),%eax + psrlq $32,%mm1 + subl $8,%ecx + jz .L004maw_sse2_exit +.L002maw_sse2_entry: + testl $4294967288,%ecx + jnz .L003maw_sse2_unrolled +.align 4 +.L005maw_sse2_loop: + movd (%edx),%mm2 + movd (%eax),%mm3 + pmuludq %mm0,%mm2 + leal 4(%edx),%edx + paddq %mm3,%mm1 + paddq %mm2,%mm1 + movd %mm1,(%eax) + subl $1,%ecx + psrlq $32,%mm1 + leal 4(%eax),%eax + jnz .L005maw_sse2_loop +.L004maw_sse2_exit: + movd %mm1,%eax + emms + ret +.align 16 +.L001maw_non_sse2: pushl %ebp pushl %ebx pushl %esi @@ -19,9 +115,9 @@ bn_mul_add_words: andl $4294967288,%ecx movl 32(%esp),%ebp pushl %ecx - jz .L000maw_finish + jz .L006maw_finish .align 16 -.L001maw_loop: +.L007maw_loop: movl (%ebx),%eax mull %ebp @@ -98,13 +194,13 @@ bn_mul_add_words: subl $8,%ecx leal 32(%ebx),%ebx leal 32(%edi),%edi - jnz .L001maw_loop -.L000maw_finish: + jnz .L007maw_loop +.L006maw_finish: movl 32(%esp),%ecx andl $7,%ecx - jnz .L002maw_finish2 - jmp .L003maw_end -.L002maw_finish2: + jnz .L008maw_finish2 + jmp .L009maw_end +.L008maw_finish2: movl (%ebx),%eax mull %ebp @@ -115,7 +211,7 @@ bn_mul_add_words: decl %ecx movl %eax,(%edi) movl %edx,%esi - jz .L003maw_end + jz .L009maw_end movl 4(%ebx),%eax mull %ebp @@ -126,7 +222,7 @@ bn_mul_add_words: decl %ecx movl %eax,4(%edi) movl %edx,%esi - jz .L003maw_end + jz .L009maw_end movl 8(%ebx),%eax mull %ebp @@ -137,7 +233,7 @@ bn_mul_add_words: decl %ecx movl %eax,8(%edi) movl %edx,%esi - jz .L003maw_end + jz .L009maw_end movl 12(%ebx),%eax mull %ebp @@ -148,7 +244,7 @@ bn_mul_add_words: decl %ecx movl %eax,12(%edi) movl %edx,%esi - jz .L003maw_end + jz .L009maw_end movl 16(%ebx),%eax mull %ebp @@ -159,7 +255,7 @@ bn_mul_add_words: decl %ecx movl %eax,16(%edi) movl %edx,%esi - jz .L003maw_end + jz .L009maw_end movl 20(%ebx),%eax mull %ebp @@ -170,7 +266,7 @@ bn_mul_add_words: decl %ecx movl %eax,20(%edi) movl %edx,%esi - jz .L003maw_end + jz .L009maw_end movl 24(%ebx),%eax mull %ebp @@ -180,7 +276,7 @@ bn_mul_add_words: adcl $0,%edx movl %eax,24(%edi) movl %edx,%esi -.L003maw_end: +.L009maw_end: movl %esi,%eax popl %ecx popl %edi @@ -195,6 +291,33 @@ bn_mul_add_words: .align 16 bn_mul_words: .L_bn_mul_words_begin: + call .L010PIC_me_up +.L010PIC_me_up: + popl %eax + leal OPENSSL_ia32cap_P-.L010PIC_me_up(%eax),%eax + btl $26,(%eax) + jnc .L011mw_non_sse2 + movl 4(%esp),%eax + movl 8(%esp),%edx + movl 12(%esp),%ecx + movd 16(%esp),%mm0 + pxor %mm1,%mm1 +.align 16 +.L012mw_sse2_loop: + movd (%edx),%mm2 + pmuludq %mm0,%mm2 + leal 4(%edx),%edx + paddq %mm2,%mm1 + movd %mm1,(%eax) + subl $1,%ecx + psrlq $32,%mm1 + leal 4(%eax),%eax + jnz .L012mw_sse2_loop + movd %mm1,%eax + emms + ret +.align 16 +.L011mw_non_sse2: pushl %ebp pushl %ebx pushl %esi @@ -206,8 +329,8 @@ bn_mul_words: movl 28(%esp),%ebp movl 32(%esp),%ecx andl $4294967288,%ebp - jz .L004mw_finish -.L005mw_loop: + jz .L013mw_finish +.L014mw_loop: movl (%ebx),%eax mull %ecx @@ -268,14 +391,14 @@ bn_mul_words: addl $32,%ebx addl $32,%edi subl $8,%ebp - jz .L004mw_finish - jmp .L005mw_loop -.L004mw_finish: + jz .L013mw_finish + jmp .L014mw_loop +.L013mw_finish: movl 28(%esp),%ebp andl $7,%ebp - jnz .L006mw_finish2 - jmp .L007mw_end -.L006mw_finish2: + jnz .L015mw_finish2 + jmp .L016mw_end +.L015mw_finish2: movl (%ebx),%eax mull %ecx @@ -284,7 +407,7 @@ bn_mul_words: movl %eax,(%edi) movl %edx,%esi decl %ebp - jz .L007mw_end + jz .L016mw_end movl 4(%ebx),%eax mull %ecx @@ -293,7 +416,7 @@ bn_mul_words: movl %eax,4(%edi) movl %edx,%esi decl %ebp - jz .L007mw_end + jz .L016mw_end movl 8(%ebx),%eax mull %ecx @@ -302,7 +425,7 @@ bn_mul_words: movl %eax,8(%edi) movl %edx,%esi decl %ebp - jz .L007mw_end + jz .L016mw_end movl 12(%ebx),%eax mull %ecx @@ -311,7 +434,7 @@ bn_mul_words: movl %eax,12(%edi) movl %edx,%esi decl %ebp - jz .L007mw_end + jz .L016mw_end movl 16(%ebx),%eax mull %ecx @@ -320,7 +443,7 @@ bn_mul_words: movl %eax,16(%edi) movl %edx,%esi decl %ebp - jz .L007mw_end + jz .L016mw_end movl 20(%ebx),%eax mull %ecx @@ -329,7 +452,7 @@ bn_mul_words: movl %eax,20(%edi) movl %edx,%esi decl %ebp - jz .L007mw_end + jz .L016mw_end movl 24(%ebx),%eax mull %ecx @@ -337,7 +460,7 @@ bn_mul_words: adcl $0,%edx movl %eax,24(%edi) movl %edx,%esi -.L007mw_end: +.L016mw_end: movl %esi,%eax popl %edi popl %esi @@ -351,6 +474,28 @@ bn_mul_words: .align 16 bn_sqr_words: .L_bn_sqr_words_begin: + call .L017PIC_me_up +.L017PIC_me_up: + popl %eax + leal OPENSSL_ia32cap_P-.L017PIC_me_up(%eax),%eax + btl $26,(%eax) + jnc .L018sqr_non_sse2 + movl 4(%esp),%eax + movl 8(%esp),%edx + movl 12(%esp),%ecx +.align 16 +.L019sqr_sse2_loop: + movd (%edx),%mm0 + pmuludq %mm0,%mm0 + leal 4(%edx),%edx + movq %mm0,(%eax) + subl $1,%ecx + leal 8(%eax),%eax + jnz .L019sqr_sse2_loop + emms + ret +.align 16 +.L018sqr_non_sse2: pushl %ebp pushl %ebx pushl %esi @@ -360,8 +505,8 @@ bn_sqr_words: movl 24(%esp),%edi movl 28(%esp),%ebx andl $4294967288,%ebx - jz .L008sw_finish -.L009sw_loop: + jz .L020sw_finish +.L021sw_loop: movl (%edi),%eax mull %eax @@ -406,59 +551,59 @@ bn_sqr_words: addl $32,%edi addl $64,%esi subl $8,%ebx - jnz .L009sw_loop -.L008sw_finish: + jnz .L021sw_loop +.L020sw_finish: movl 28(%esp),%ebx andl $7,%ebx - jz .L010sw_end + jz .L022sw_end movl (%edi),%eax mull %eax movl %eax,(%esi) decl %ebx movl %edx,4(%esi) - jz .L010sw_end + jz .L022sw_end movl 4(%edi),%eax mull %eax movl %eax,8(%esi) decl %ebx movl %edx,12(%esi) - jz .L010sw_end + jz .L022sw_end movl 8(%edi),%eax mull %eax movl %eax,16(%esi) decl %ebx movl %edx,20(%esi) - jz .L010sw_end + jz .L022sw_end movl 12(%edi),%eax mull %eax movl %eax,24(%esi) decl %ebx movl %edx,28(%esi) - jz .L010sw_end + jz .L022sw_end movl 16(%edi),%eax mull %eax movl %eax,32(%esi) decl %ebx movl %edx,36(%esi) - jz .L010sw_end + jz .L022sw_end movl 20(%edi),%eax mull %eax movl %eax,40(%esi) decl %ebx movl %edx,44(%esi) - jz .L010sw_end + jz .L022sw_end movl 24(%edi),%eax mull %eax movl %eax,48(%esi) movl %edx,52(%esi) -.L010sw_end: +.L022sw_end: popl %edi popl %esi popl %ebx @@ -494,8 +639,8 @@ bn_add_words: movl 32(%esp),%ebp xorl %eax,%eax andl $4294967288,%ebp - jz .L011aw_finish -.L012aw_loop: + jz .L023aw_finish +.L024aw_loop: movl (%esi),%ecx movl (%edi),%edx @@ -573,11 +718,11 @@ bn_add_words: addl $32,%edi addl $32,%ebx subl $8,%ebp - jnz .L012aw_loop -.L011aw_finish: + jnz .L024aw_loop +.L023aw_finish: movl 32(%esp),%ebp andl $7,%ebp - jz .L013aw_end + jz .L025aw_end movl (%esi),%ecx movl (%edi),%edx @@ -588,7 +733,7 @@ bn_add_words: adcl $0,%eax decl %ebp movl %ecx,(%ebx) - jz .L013aw_end + jz .L025aw_end movl 4(%esi),%ecx movl 4(%edi),%edx @@ -599,7 +744,7 @@ bn_add_words: adcl $0,%eax decl %ebp movl %ecx,4(%ebx) - jz .L013aw_end + jz .L025aw_end movl 8(%esi),%ecx movl 8(%edi),%edx @@ -610,7 +755,7 @@ bn_add_words: adcl $0,%eax decl %ebp movl %ecx,8(%ebx) - jz .L013aw_end + jz .L025aw_end movl 12(%esi),%ecx movl 12(%edi),%edx @@ -621,7 +766,7 @@ bn_add_words: adcl $0,%eax decl %ebp movl %ecx,12(%ebx) - jz .L013aw_end + jz .L025aw_end movl 16(%esi),%ecx movl 16(%edi),%edx @@ -632,7 +777,7 @@ bn_add_words: adcl $0,%eax decl %ebp movl %ecx,16(%ebx) - jz .L013aw_end + jz .L025aw_end movl 20(%esi),%ecx movl 20(%edi),%edx @@ -643,7 +788,7 @@ bn_add_words: adcl $0,%eax decl %ebp movl %ecx,20(%ebx) - jz .L013aw_end + jz .L025aw_end movl 24(%esi),%ecx movl 24(%edi),%edx @@ -653,7 +798,7 @@ bn_add_words: addl %edx,%ecx adcl $0,%eax movl %ecx,24(%ebx) -.L013aw_end: +.L025aw_end: popl %edi popl %esi popl %ebx @@ -677,8 +822,8 @@ bn_sub_words: movl 32(%esp),%ebp xorl %eax,%eax andl $4294967288,%ebp - jz .L014aw_finish -.L015aw_loop: + jz .L026aw_finish +.L027aw_loop: movl (%esi),%ecx movl (%edi),%edx @@ -756,11 +901,11 @@ bn_sub_words: addl $32,%edi addl $32,%ebx subl $8,%ebp - jnz .L015aw_loop -.L014aw_finish: + jnz .L027aw_loop +.L026aw_finish: movl 32(%esp),%ebp andl $7,%ebp - jz .L016aw_end + jz .L028aw_end movl (%esi),%ecx movl (%edi),%edx @@ -771,7 +916,7 @@ bn_sub_words: adcl $0,%eax decl %ebp movl %ecx,(%ebx) - jz .L016aw_end + jz .L028aw_end movl 4(%esi),%ecx movl 4(%edi),%edx @@ -782,7 +927,7 @@ bn_sub_words: adcl $0,%eax decl %ebp movl %ecx,4(%ebx) - jz .L016aw_end + jz .L028aw_end movl 8(%esi),%ecx movl 8(%edi),%edx @@ -793,7 +938,7 @@ bn_sub_words: adcl $0,%eax decl %ebp movl %ecx,8(%ebx) - jz .L016aw_end + jz .L028aw_end movl 12(%esi),%ecx movl 12(%edi),%edx @@ -804,7 +949,7 @@ bn_sub_words: adcl $0,%eax decl %ebp movl %ecx,12(%ebx) - jz .L016aw_end + jz .L028aw_end movl 16(%esi),%ecx movl 16(%edi),%edx @@ -815,7 +960,7 @@ bn_sub_words: adcl $0,%eax decl %ebp movl %ecx,16(%ebx) - jz .L016aw_end + jz .L028aw_end movl 20(%esi),%ecx movl 20(%edi),%edx @@ -826,7 +971,7 @@ bn_sub_words: adcl $0,%eax decl %ebp movl %ecx,20(%ebx) - jz .L016aw_end + jz .L028aw_end movl 24(%esi),%ecx movl 24(%edi),%edx @@ -836,7 +981,7 @@ bn_sub_words: subl %edx,%ecx adcl $0,%eax movl %ecx,24(%ebx) -.L016aw_end: +.L028aw_end: popl %edi popl %esi popl %ebx @@ -860,8 +1005,8 @@ bn_sub_part_words: movl 32(%esp),%ebp xorl %eax,%eax andl $4294967288,%ebp - jz .L017aw_finish -.L018aw_loop: + jz .L029aw_finish +.L030aw_loop: movl (%esi),%ecx movl (%edi),%edx @@ -939,11 +1084,11 @@ bn_sub_part_words: addl $32,%edi addl $32,%ebx subl $8,%ebp - jnz .L018aw_loop -.L017aw_finish: + jnz .L030aw_loop +.L029aw_finish: movl 32(%esp),%ebp andl $7,%ebp - jz .L019aw_end + jz .L031aw_end movl (%esi),%ecx movl (%edi),%edx @@ -957,7 +1102,7 @@ bn_sub_part_words: addl $4,%edi addl $4,%ebx decl %ebp - jz .L019aw_end + jz .L031aw_end movl (%esi),%ecx movl (%edi),%edx @@ -971,7 +1116,7 @@ bn_sub_part_words: addl $4,%edi addl $4,%ebx decl %ebp - jz .L019aw_end + jz .L031aw_end movl (%esi),%ecx movl (%edi),%edx @@ -985,7 +1130,7 @@ bn_sub_part_words: addl $4,%edi addl $4,%ebx decl %ebp - jz .L019aw_end + jz .L031aw_end movl (%esi),%ecx movl (%edi),%edx @@ -999,7 +1144,7 @@ bn_sub_part_words: addl $4,%edi addl $4,%ebx decl %ebp - jz .L019aw_end + jz .L031aw_end movl (%esi),%ecx movl (%edi),%edx @@ -1013,7 +1158,7 @@ bn_sub_part_words: addl $4,%edi addl $4,%ebx decl %ebp - jz .L019aw_end + jz .L031aw_end movl (%esi),%ecx movl (%edi),%edx @@ -1027,7 +1172,7 @@ bn_sub_part_words: addl $4,%edi addl $4,%ebx decl %ebp - jz .L019aw_end + jz .L031aw_end movl (%esi),%ecx movl (%edi),%edx @@ -1040,20 +1185,20 @@ bn_sub_part_words: addl $4,%esi addl $4,%edi addl $4,%ebx -.L019aw_end: +.L031aw_end: cmpl $0,36(%esp) - je .L020pw_end + je .L032pw_end movl 36(%esp),%ebp cmpl $0,%ebp - je .L020pw_end - jge .L021pw_pos + je .L032pw_end + jge .L033pw_pos movl $0,%edx subl %ebp,%edx movl %edx,%ebp andl $4294967288,%ebp - jz .L022pw_neg_finish -.L023pw_neg_loop: + jz .L034pw_neg_finish +.L035pw_neg_loop: movl $0,%ecx movl (%edi),%edx @@ -1130,13 +1275,13 @@ bn_sub_part_words: addl $32,%edi addl $32,%ebx subl $8,%ebp - jnz .L023pw_neg_loop -.L022pw_neg_finish: + jnz .L035pw_neg_loop +.L034pw_neg_finish: movl 36(%esp),%edx movl $0,%ebp subl %edx,%ebp andl $7,%ebp - jz .L020pw_end + jz .L032pw_end movl $0,%ecx movl (%edi),%edx @@ -1147,7 +1292,7 @@ bn_sub_part_words: adcl $0,%eax decl %ebp movl %ecx,(%ebx) - jz .L020pw_end + jz .L032pw_end movl $0,%ecx movl 4(%edi),%edx @@ -1158,7 +1303,7 @@ bn_sub_part_words: adcl $0,%eax decl %ebp movl %ecx,4(%ebx) - jz .L020pw_end + jz .L032pw_end movl $0,%ecx movl 8(%edi),%edx @@ -1169,7 +1314,7 @@ bn_sub_part_words: adcl $0,%eax decl %ebp movl %ecx,8(%ebx) - jz .L020pw_end + jz .L032pw_end movl $0,%ecx movl 12(%edi),%edx @@ -1180,7 +1325,7 @@ bn_sub_part_words: adcl $0,%eax decl %ebp movl %ecx,12(%ebx) - jz .L020pw_end + jz .L032pw_end movl $0,%ecx movl 16(%edi),%edx @@ -1191,7 +1336,7 @@ bn_sub_part_words: adcl $0,%eax decl %ebp movl %ecx,16(%ebx) - jz .L020pw_end + jz .L032pw_end movl $0,%ecx movl 20(%edi),%edx @@ -1202,7 +1347,7 @@ bn_sub_part_words: adcl $0,%eax decl %ebp movl %ecx,20(%ebx) - jz .L020pw_end + jz .L032pw_end movl $0,%ecx movl 24(%edi),%edx @@ -1212,178 +1357,178 @@ bn_sub_part_words: subl %edx,%ecx adcl $0,%eax movl %ecx,24(%ebx) - jmp .L020pw_end -.L021pw_pos: + jmp .L032pw_end +.L033pw_pos: andl $4294967288,%ebp - jz .L024pw_pos_finish -.L025pw_pos_loop: + jz .L036pw_pos_finish +.L037pw_pos_loop: movl (%esi),%ecx subl %eax,%ecx movl %ecx,(%ebx) - jnc .L026pw_nc0 + jnc .L038pw_nc0 movl 4(%esi),%ecx subl %eax,%ecx movl %ecx,4(%ebx) - jnc .L027pw_nc1 + jnc .L039pw_nc1 movl 8(%esi),%ecx subl %eax,%ecx movl %ecx,8(%ebx) - jnc .L028pw_nc2 + jnc .L040pw_nc2 movl 12(%esi),%ecx subl %eax,%ecx movl %ecx,12(%ebx) - jnc .L029pw_nc3 + jnc .L041pw_nc3 movl 16(%esi),%ecx subl %eax,%ecx movl %ecx,16(%ebx) - jnc .L030pw_nc4 + jnc .L042pw_nc4 movl 20(%esi),%ecx subl %eax,%ecx movl %ecx,20(%ebx) - jnc .L031pw_nc5 + jnc .L043pw_nc5 movl 24(%esi),%ecx subl %eax,%ecx movl %ecx,24(%ebx) - jnc .L032pw_nc6 + jnc .L044pw_nc6 movl 28(%esi),%ecx subl %eax,%ecx movl %ecx,28(%ebx) - jnc .L033pw_nc7 + jnc .L045pw_nc7 addl $32,%esi addl $32,%ebx subl $8,%ebp - jnz .L025pw_pos_loop -.L024pw_pos_finish: + jnz .L037pw_pos_loop +.L036pw_pos_finish: movl 36(%esp),%ebp andl $7,%ebp - jz .L020pw_end + jz .L032pw_end movl (%esi),%ecx subl %eax,%ecx movl %ecx,(%ebx) - jnc .L034pw_tail_nc0 + jnc .L046pw_tail_nc0 decl %ebp - jz .L020pw_end + jz .L032pw_end movl 4(%esi),%ecx subl %eax,%ecx movl %ecx,4(%ebx) - jnc .L035pw_tail_nc1 + jnc .L047pw_tail_nc1 decl %ebp - jz .L020pw_end + jz .L032pw_end movl 8(%esi),%ecx subl %eax,%ecx movl %ecx,8(%ebx) - jnc .L036pw_tail_nc2 + jnc .L048pw_tail_nc2 decl %ebp - jz .L020pw_end + jz .L032pw_end movl 12(%esi),%ecx subl %eax,%ecx movl %ecx,12(%ebx) - jnc .L037pw_tail_nc3 + jnc .L049pw_tail_nc3 decl %ebp - jz .L020pw_end + jz .L032pw_end movl 16(%esi),%ecx subl %eax,%ecx movl %ecx,16(%ebx) - jnc .L038pw_tail_nc4 + jnc .L050pw_tail_nc4 decl %ebp - jz .L020pw_end + jz .L032pw_end movl 20(%esi),%ecx subl %eax,%ecx movl %ecx,20(%ebx) - jnc .L039pw_tail_nc5 + jnc .L051pw_tail_nc5 decl %ebp - jz .L020pw_end + jz .L032pw_end movl 24(%esi),%ecx subl %eax,%ecx movl %ecx,24(%ebx) - jnc .L040pw_tail_nc6 + jnc .L052pw_tail_nc6 movl $1,%eax - jmp .L020pw_end -.L041pw_nc_loop: + jmp .L032pw_end +.L053pw_nc_loop: movl (%esi),%ecx movl %ecx,(%ebx) -.L026pw_nc0: +.L038pw_nc0: movl 4(%esi),%ecx movl %ecx,4(%ebx) -.L027pw_nc1: +.L039pw_nc1: movl 8(%esi),%ecx movl %ecx,8(%ebx) -.L028pw_nc2: +.L040pw_nc2: movl 12(%esi),%ecx movl %ecx,12(%ebx) -.L029pw_nc3: +.L041pw_nc3: movl 16(%esi),%ecx movl %ecx,16(%ebx) -.L030pw_nc4: +.L042pw_nc4: movl 20(%esi),%ecx movl %ecx,20(%ebx) -.L031pw_nc5: +.L043pw_nc5: movl 24(%esi),%ecx movl %ecx,24(%ebx) -.L032pw_nc6: +.L044pw_nc6: movl 28(%esi),%ecx movl %ecx,28(%ebx) -.L033pw_nc7: +.L045pw_nc7: addl $32,%esi addl $32,%ebx subl $8,%ebp - jnz .L041pw_nc_loop + jnz .L053pw_nc_loop movl 36(%esp),%ebp andl $7,%ebp - jz .L042pw_nc_end + jz .L054pw_nc_end movl (%esi),%ecx movl %ecx,(%ebx) -.L034pw_tail_nc0: +.L046pw_tail_nc0: decl %ebp - jz .L042pw_nc_end + jz .L054pw_nc_end movl 4(%esi),%ecx movl %ecx,4(%ebx) -.L035pw_tail_nc1: +.L047pw_tail_nc1: decl %ebp - jz .L042pw_nc_end + jz .L054pw_nc_end movl 8(%esi),%ecx movl %ecx,8(%ebx) -.L036pw_tail_nc2: +.L048pw_tail_nc2: decl %ebp - jz .L042pw_nc_end + jz .L054pw_nc_end movl 12(%esi),%ecx movl %ecx,12(%ebx) -.L037pw_tail_nc3: +.L049pw_tail_nc3: decl %ebp - jz .L042pw_nc_end + jz .L054pw_nc_end movl 16(%esi),%ecx movl %ecx,16(%ebx) -.L038pw_tail_nc4: +.L050pw_tail_nc4: decl %ebp - jz .L042pw_nc_end + jz .L054pw_nc_end movl 20(%esi),%ecx movl %ecx,20(%ebx) -.L039pw_tail_nc5: +.L051pw_tail_nc5: decl %ebp - jz .L042pw_nc_end + jz .L054pw_nc_end movl 24(%esi),%ecx movl %ecx,24(%ebx) -.L040pw_tail_nc6: -.L042pw_nc_end: +.L052pw_tail_nc6: +.L054pw_nc_end: movl $0,%eax -.L020pw_end: +.L032pw_end: popl %edi popl %esi popl %ebx diff --git a/linux-x86/crypto/bn/x86-mont.S b/linux-x86/crypto/bn/x86-mont.S index e2d23b3..1569b2c 100644 --- a/linux-x86/crypto/bn/x86-mont.S +++ b/linux-x86/crypto/bn/x86-mont.S @@ -44,6 +44,126 @@ bn_mul_mont: movl %esi,20(%esp) leal -3(%edi),%ebx movl %ebp,24(%esp) + call .L001PIC_me_up +.L001PIC_me_up: + popl %eax + leal OPENSSL_ia32cap_P-.L001PIC_me_up(%eax),%eax + btl $26,(%eax) + jnc .L002non_sse2 + movl $-1,%eax + movd %eax,%mm7 + movl 8(%esp),%esi + movl 12(%esp),%edi + movl 16(%esp),%ebp + xorl %edx,%edx + xorl %ecx,%ecx + movd (%edi),%mm4 + movd (%esi),%mm5 + movd (%ebp),%mm3 + pmuludq %mm4,%mm5 + movq %mm5,%mm2 + movq %mm5,%mm0 + pand %mm7,%mm0 + pmuludq 20(%esp),%mm5 + pmuludq %mm5,%mm3 + paddq %mm0,%mm3 + movd 4(%ebp),%mm1 + movd 4(%esi),%mm0 + psrlq $32,%mm2 + psrlq $32,%mm3 + incl %ecx +.align 16 +.L0031st: + pmuludq %mm4,%mm0 + pmuludq %mm5,%mm1 + paddq %mm0,%mm2 + paddq %mm1,%mm3 + movq %mm2,%mm0 + pand %mm7,%mm0 + movd 4(%ebp,%ecx,4),%mm1 + paddq %mm0,%mm3 + movd 4(%esi,%ecx,4),%mm0 + psrlq $32,%mm2 + movd %mm3,28(%esp,%ecx,4) + psrlq $32,%mm3 + leal 1(%ecx),%ecx + cmpl %ebx,%ecx + jl .L0031st + pmuludq %mm4,%mm0 + pmuludq %mm5,%mm1 + paddq %mm0,%mm2 + paddq %mm1,%mm3 + movq %mm2,%mm0 + pand %mm7,%mm0 + paddq %mm0,%mm3 + movd %mm3,28(%esp,%ecx,4) + psrlq $32,%mm2 + psrlq $32,%mm3 + paddq %mm2,%mm3 + movq %mm3,32(%esp,%ebx,4) + incl %edx +.L004outer: + xorl %ecx,%ecx + movd (%edi,%edx,4),%mm4 + movd (%esi),%mm5 + movd 32(%esp),%mm6 + movd (%ebp),%mm3 + pmuludq %mm4,%mm5 + paddq %mm6,%mm5 + movq %mm5,%mm0 + movq %mm5,%mm2 + pand %mm7,%mm0 + pmuludq 20(%esp),%mm5 + pmuludq %mm5,%mm3 + paddq %mm0,%mm3 + movd 36(%esp),%mm6 + movd 4(%ebp),%mm1 + movd 4(%esi),%mm0 + psrlq $32,%mm2 + psrlq $32,%mm3 + paddq %mm6,%mm2 + incl %ecx + decl %ebx +.L005inner: + pmuludq %mm4,%mm0 + pmuludq %mm5,%mm1 + paddq %mm0,%mm2 + paddq %mm1,%mm3 + movq %mm2,%mm0 + movd 36(%esp,%ecx,4),%mm6 + pand %mm7,%mm0 + movd 4(%ebp,%ecx,4),%mm1 + paddq %mm0,%mm3 + movd 4(%esi,%ecx,4),%mm0 + psrlq $32,%mm2 + movd %mm3,28(%esp,%ecx,4) + psrlq $32,%mm3 + paddq %mm6,%mm2 + decl %ebx + leal 1(%ecx),%ecx + jnz .L005inner + movl %ecx,%ebx + pmuludq %mm4,%mm0 + pmuludq %mm5,%mm1 + paddq %mm0,%mm2 + paddq %mm1,%mm3 + movq %mm2,%mm0 + pand %mm7,%mm0 + paddq %mm0,%mm3 + movd %mm3,28(%esp,%ecx,4) + psrlq $32,%mm2 + psrlq $32,%mm3 + movd 36(%esp,%ebx,4),%mm6 + paddq %mm2,%mm3 + paddq %mm6,%mm3 + movq %mm3,32(%esp,%ebx,4) + leal 1(%edx),%edx + cmpl %ebx,%edx + jle .L004outer + emms + jmp .L006common_tail +.align 16 +.L002non_sse2: movl 8(%esp),%esi leal 1(%ebx),%ebp movl 12(%esp),%edi @@ -54,12 +174,12 @@ bn_mul_mont: leal 4(%edi,%ebx,4),%eax orl %edx,%ebp movl (%edi),%edi - jz .L001bn_sqr_mont + jz .L007bn_sqr_mont movl %eax,28(%esp) movl (%esi),%eax xorl %edx,%edx .align 16 -.L002mull: +.L008mull: movl %edx,%ebp mull %edi addl %eax,%ebp @@ -68,7 +188,7 @@ bn_mul_mont: movl (%esi,%ecx,4),%eax cmpl %ebx,%ecx movl %ebp,28(%esp,%ecx,4) - jl .L002mull + jl .L008mull movl %edx,%ebp mull %edi movl 20(%esp),%edi @@ -86,9 +206,9 @@ bn_mul_mont: movl 4(%esi),%eax adcl $0,%edx incl %ecx - jmp .L0032ndmadd + jmp .L0092ndmadd .align 16 -.L0041stmadd: +.L0101stmadd: movl %edx,%ebp mull %edi addl 32(%esp,%ecx,4),%ebp @@ -99,7 +219,7 @@ bn_mul_mont: adcl $0,%edx cmpl %ebx,%ecx movl %ebp,28(%esp,%ecx,4) - jl .L0041stmadd + jl .L0101stmadd movl %edx,%ebp mull %edi addl 32(%esp,%ebx,4),%eax @@ -122,7 +242,7 @@ bn_mul_mont: adcl $0,%edx movl $1,%ecx .align 16 -.L0032ndmadd: +.L0092ndmadd: movl %edx,%ebp mull %edi addl 32(%esp,%ecx,4),%ebp @@ -133,7 +253,7 @@ bn_mul_mont: adcl $0,%edx cmpl %ebx,%ecx movl %ebp,24(%esp,%ecx,4) - jl .L0032ndmadd + jl .L0092ndmadd movl %edx,%ebp mull %edi addl 32(%esp,%ebx,4),%ebp @@ -149,16 +269,16 @@ bn_mul_mont: movl %edx,32(%esp,%ebx,4) cmpl 28(%esp),%ecx movl %eax,36(%esp,%ebx,4) - je .L005common_tail + je .L006common_tail movl (%ecx),%edi movl 8(%esp),%esi movl %ecx,12(%esp) xorl %ecx,%ecx xorl %edx,%edx movl (%esi),%eax - jmp .L0041stmadd + jmp .L0101stmadd .align 16 -.L001bn_sqr_mont: +.L007bn_sqr_mont: movl %ebx,(%esp) movl %ecx,12(%esp) movl %edi,%eax @@ -169,7 +289,7 @@ bn_mul_mont: andl $1,%ebx incl %ecx .align 16 -.L006sqr: +.L011sqr: movl (%esi,%ecx,4),%eax movl %edx,%ebp mull %edi @@ -181,7 +301,7 @@ bn_mul_mont: cmpl (%esp),%ecx movl %eax,%ebx movl %ebp,28(%esp,%ecx,4) - jl .L006sqr + jl .L011sqr movl (%esi,%ecx,4),%eax movl %edx,%ebp mull %edi @@ -205,7 +325,7 @@ bn_mul_mont: movl 4(%esi),%eax movl $1,%ecx .align 16 -.L0073rdmadd: +.L0123rdmadd: movl %edx,%ebp mull %edi addl 32(%esp,%ecx,4),%ebp @@ -224,7 +344,7 @@ bn_mul_mont: adcl $0,%edx cmpl %ebx,%ecx movl %ebp,24(%esp,%ecx,4) - jl .L0073rdmadd + jl .L0123rdmadd movl %edx,%ebp mull %edi addl 32(%esp,%ebx,4),%ebp @@ -240,7 +360,7 @@ bn_mul_mont: movl %edx,32(%esp,%ebx,4) cmpl %ebx,%ecx movl %eax,36(%esp,%ebx,4) - je .L005common_tail + je .L006common_tail movl 4(%esi,%ecx,4),%edi leal 1(%ecx),%ecx movl %edi,%eax @@ -252,12 +372,12 @@ bn_mul_mont: xorl %ebp,%ebp cmpl %ebx,%ecx leal 1(%ecx),%ecx - je .L008sqrlast + je .L013sqrlast movl %edx,%ebx shrl $1,%edx andl $1,%ebx .align 16 -.L009sqradd: +.L014sqradd: movl (%esi,%ecx,4),%eax movl %edx,%ebp mull %edi @@ -273,13 +393,13 @@ bn_mul_mont: cmpl (%esp),%ecx movl %ebp,28(%esp,%ecx,4) movl %eax,%ebx - jle .L009sqradd + jle .L014sqradd movl %edx,%ebp addl %edx,%edx shrl $31,%ebp addl %ebx,%edx adcl $0,%ebp -.L008sqrlast: +.L013sqrlast: movl 20(%esp),%edi movl 16(%esp),%esi imull 32(%esp),%edi @@ -294,9 +414,9 @@ bn_mul_mont: adcl $0,%edx movl $1,%ecx movl 4(%esi),%eax - jmp .L0073rdmadd + jmp .L0123rdmadd .align 16 -.L005common_tail: +.L006common_tail: movl 16(%esp),%ebp movl 4(%esp),%edi leal 32(%esp),%esi @@ -304,16 +424,16 @@ bn_mul_mont: movl %ebx,%ecx xorl %edx,%edx .align 16 -.L010sub: +.L015sub: sbbl (%ebp,%edx,4),%eax movl %eax,(%edi,%edx,4) decl %ecx movl 4(%esi,%edx,4),%eax leal 1(%edx),%edx - jge .L010sub + jge .L015sub sbbl $0,%eax .align 16 -.L011copy: +.L016copy: movl (%esi,%ebx,4),%edx movl (%edi,%ebx,4),%ebp xorl %ebp,%edx @@ -322,7 +442,7 @@ bn_mul_mont: movl %ecx,(%esi,%ebx,4) movl %edx,(%edi,%ebx,4) decl %ebx - jge .L011copy + jge .L016copy movl 24(%esp),%esp movl $1,%eax .L000just_leave: diff --git a/linux-x86/crypto/cpu-x86-asm.S b/linux-x86/crypto/cpu-x86-asm.S index b6f767b..24a8dd4 100644 --- a/linux-x86/crypto/cpu-x86-asm.S +++ b/linux-x86/crypto/cpu-x86-asm.S @@ -101,10 +101,6 @@ OPENSSL_ia32_cpuid: cmpl $0,%ebp jne .L005notintel orl $1073741824,%edx - andb $15,%ah - cmpb $15,%ah - jne .L005notintel - orl $1048576,%edx .L005notintel: btl $28,%edx jnc .L002generic @@ -241,6 +237,18 @@ OPENSSL_wipe_cpu: movl (%ecx),%ecx btl $1,(%ecx) jnc .L016no_x87 + andl $83886080,%ecx + cmpl $83886080,%ecx + jne .L017no_sse2 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 +.L017no_sse2: .long 4007259865,4007259865,4007259865,4007259865,2430851995 .L016no_x87: leal 4(%esp),%eax @@ -257,11 +265,11 @@ OPENSSL_atomic_add: pushl %ebx nop movl (%edx),%eax -.L017spin: +.L018spin: leal (%eax,%ecx,1),%ebx nop .long 447811568 - jne .L017spin + jne .L018spin movl %ebx,%eax popl %ebx ret @@ -301,11 +309,11 @@ OPENSSL_indirect_call: OPENSSL_ia32_rdrand: .L_OPENSSL_ia32_rdrand_begin: movl $8,%ecx -.L018loop: +.L019loop: .byte 15,199,240 - jc .L019break - loop .L018loop -.L019break: + jc .L020break + loop .L019loop +.L020break: cmpl $0,%eax cmovel %ecx,%eax ret diff --git a/linux-x86/crypto/sha/sha1-586.S b/linux-x86/crypto/sha/sha1-586.S index 71ae5b3..808ccac 100644 --- a/linux-x86/crypto/sha/sha1-586.S +++ b/linux-x86/crypto/sha/sha1-586.S @@ -11,6 +11,23 @@ sha1_block_data_order: pushl %ebx pushl %esi pushl %edi + call .L000pic_point +.L000pic_point: + popl %ebp + leal OPENSSL_ia32cap_P-.L000pic_point(%ebp),%esi + leal .LK_XX_XX-.L000pic_point(%ebp),%ebp + movl (%esi),%eax + movl 4(%esi),%edx + testl $512,%edx + jz .L001x86 + movl 8(%esi),%ecx + testl $16777216,%eax + jz .L001x86 + testl $536870912,%ecx + jnz .Lshaext_shortcut + jmp .Lssse3_shortcut +.align 16 +.L001x86: movl 20(%esp),%ebp movl 24(%esp),%esi movl 28(%esp),%eax @@ -19,9 +36,9 @@ sha1_block_data_order: addl %esi,%eax movl %eax,104(%esp) movl 16(%ebp),%edi - jmp .L000loop + jmp .L002loop .align 16 -.L000loop: +.L002loop: movl (%esi),%eax movl 4(%esi),%ebx movl 8(%esi),%ecx @@ -1368,7 +1385,7 @@ sha1_block_data_order: movl %ebx,12(%ebp) movl %edx,%esi movl %ecx,16(%ebp) - jb .L000loop + jb .L002loop addl $76,%esp popl %edi popl %esi @@ -1376,6 +1393,1405 @@ sha1_block_data_order: popl %ebp ret .size sha1_block_data_order,.-.L_sha1_block_data_order_begin +.hidden _sha1_block_data_order_shaext +.type _sha1_block_data_order_shaext,@function +.align 16 +_sha1_block_data_order_shaext: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + call .L003pic_point +.L003pic_point: + popl %ebp + leal .LK_XX_XX-.L003pic_point(%ebp),%ebp +.Lshaext_shortcut: + movl 20(%esp),%edi + movl %esp,%ebx + movl 24(%esp),%esi + movl 28(%esp),%ecx + subl $32,%esp + movdqu (%edi),%xmm0 + movd 16(%edi),%xmm1 + andl $-32,%esp + movdqa 80(%ebp),%xmm3 + movdqu (%esi),%xmm4 + pshufd $27,%xmm0,%xmm0 + movdqu 16(%esi),%xmm5 + pshufd $27,%xmm1,%xmm1 + movdqu 32(%esi),%xmm6 +.byte 102,15,56,0,227 + movdqu 48(%esi),%xmm7 +.byte 102,15,56,0,235 +.byte 102,15,56,0,243 +.byte 102,15,56,0,251 + jmp .L004loop_shaext +.align 16 +.L004loop_shaext: + decl %ecx + leal 64(%esi),%eax + movdqa %xmm1,(%esp) + paddd %xmm4,%xmm1 + cmovnel %eax,%esi + movdqa %xmm0,16(%esp) +.byte 15,56,201,229 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,0 +.byte 15,56,200,213 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 +.byte 15,56,202,231 + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,0 +.byte 15,56,200,206 + pxor %xmm7,%xmm5 +.byte 15,56,202,236 +.byte 15,56,201,247 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,0 +.byte 15,56,200,215 + pxor %xmm4,%xmm6 +.byte 15,56,201,252 +.byte 15,56,202,245 + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,0 +.byte 15,56,200,204 + pxor %xmm5,%xmm7 +.byte 15,56,202,254 +.byte 15,56,201,229 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,0 +.byte 15,56,200,213 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 +.byte 15,56,202,231 + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,1 +.byte 15,56,200,206 + pxor %xmm7,%xmm5 +.byte 15,56,202,236 +.byte 15,56,201,247 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,1 +.byte 15,56,200,215 + pxor %xmm4,%xmm6 +.byte 15,56,201,252 +.byte 15,56,202,245 + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,1 +.byte 15,56,200,204 + pxor %xmm5,%xmm7 +.byte 15,56,202,254 +.byte 15,56,201,229 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,1 +.byte 15,56,200,213 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 +.byte 15,56,202,231 + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,1 +.byte 15,56,200,206 + pxor %xmm7,%xmm5 +.byte 15,56,202,236 +.byte 15,56,201,247 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,2 +.byte 15,56,200,215 + pxor %xmm4,%xmm6 +.byte 15,56,201,252 +.byte 15,56,202,245 + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,2 +.byte 15,56,200,204 + pxor %xmm5,%xmm7 +.byte 15,56,202,254 +.byte 15,56,201,229 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,2 +.byte 15,56,200,213 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 +.byte 15,56,202,231 + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,2 +.byte 15,56,200,206 + pxor %xmm7,%xmm5 +.byte 15,56,202,236 +.byte 15,56,201,247 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,2 +.byte 15,56,200,215 + pxor %xmm4,%xmm6 +.byte 15,56,201,252 +.byte 15,56,202,245 + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,3 +.byte 15,56,200,204 + pxor %xmm5,%xmm7 +.byte 15,56,202,254 + movdqu (%esi),%xmm4 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,3 +.byte 15,56,200,213 + movdqu 16(%esi),%xmm5 +.byte 102,15,56,0,227 + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,3 +.byte 15,56,200,206 + movdqu 32(%esi),%xmm6 +.byte 102,15,56,0,235 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,3 +.byte 15,56,200,215 + movdqu 48(%esi),%xmm7 +.byte 102,15,56,0,243 + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,3 + movdqa (%esp),%xmm2 +.byte 102,15,56,0,251 +.byte 15,56,200,202 + paddd 16(%esp),%xmm0 + jnz .L004loop_shaext + pshufd $27,%xmm0,%xmm0 + pshufd $27,%xmm1,%xmm1 + movdqu %xmm0,(%edi) + movd %xmm1,16(%edi) + movl %ebx,%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size _sha1_block_data_order_shaext,.-_sha1_block_data_order_shaext +.hidden _sha1_block_data_order_ssse3 +.type _sha1_block_data_order_ssse3,@function +.align 16 +_sha1_block_data_order_ssse3: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + call .L005pic_point +.L005pic_point: + popl %ebp + leal .LK_XX_XX-.L005pic_point(%ebp),%ebp +.Lssse3_shortcut: + movdqa (%ebp),%xmm7 + movdqa 16(%ebp),%xmm0 + movdqa 32(%ebp),%xmm1 + movdqa 48(%ebp),%xmm2 + movdqa 64(%ebp),%xmm6 + movl 20(%esp),%edi + movl 24(%esp),%ebp + movl 28(%esp),%edx + movl %esp,%esi + subl $208,%esp + andl $-64,%esp + movdqa %xmm0,112(%esp) + movdqa %xmm1,128(%esp) + movdqa %xmm2,144(%esp) + shll $6,%edx + movdqa %xmm7,160(%esp) + addl %ebp,%edx + movdqa %xmm6,176(%esp) + addl $64,%ebp + movl %edi,192(%esp) + movl %ebp,196(%esp) + movl %edx,200(%esp) + movl %esi,204(%esp) + movl (%edi),%eax + movl 4(%edi),%ebx + movl 8(%edi),%ecx + movl 12(%edi),%edx + movl 16(%edi),%edi + movl %ebx,%esi + movdqu -64(%ebp),%xmm0 + movdqu -48(%ebp),%xmm1 + movdqu -32(%ebp),%xmm2 + movdqu -16(%ebp),%xmm3 +.byte 102,15,56,0,198 +.byte 102,15,56,0,206 +.byte 102,15,56,0,214 + movdqa %xmm7,96(%esp) +.byte 102,15,56,0,222 + paddd %xmm7,%xmm0 + paddd %xmm7,%xmm1 + paddd %xmm7,%xmm2 + movdqa %xmm0,(%esp) + psubd %xmm7,%xmm0 + movdqa %xmm1,16(%esp) + psubd %xmm7,%xmm1 + movdqa %xmm2,32(%esp) + movl %ecx,%ebp + psubd %xmm7,%xmm2 + xorl %edx,%ebp + pshufd $238,%xmm0,%xmm4 + andl %ebp,%esi + jmp .L006loop +.align 16 +.L006loop: + rorl $2,%ebx + xorl %edx,%esi + movl %eax,%ebp + punpcklqdq %xmm1,%xmm4 + movdqa %xmm3,%xmm6 + addl (%esp),%edi + xorl %ecx,%ebx + paddd %xmm3,%xmm7 + movdqa %xmm0,64(%esp) + roll $5,%eax + addl %esi,%edi + psrldq $4,%xmm6 + andl %ebx,%ebp + xorl %ecx,%ebx + pxor %xmm0,%xmm4 + addl %eax,%edi + rorl $7,%eax + pxor %xmm2,%xmm6 + xorl %ecx,%ebp + movl %edi,%esi + addl 4(%esp),%edx + pxor %xmm6,%xmm4 + xorl %ebx,%eax + roll $5,%edi + movdqa %xmm7,48(%esp) + addl %ebp,%edx + andl %eax,%esi + movdqa %xmm4,%xmm0 + xorl %ebx,%eax + addl %edi,%edx + rorl $7,%edi + movdqa %xmm4,%xmm6 + xorl %ebx,%esi + pslldq $12,%xmm0 + paddd %xmm4,%xmm4 + movl %edx,%ebp + addl 8(%esp),%ecx + psrld $31,%xmm6 + xorl %eax,%edi + roll $5,%edx + movdqa %xmm0,%xmm7 + addl %esi,%ecx + andl %edi,%ebp + xorl %eax,%edi + psrld $30,%xmm0 + addl %edx,%ecx + rorl $7,%edx + por %xmm6,%xmm4 + xorl %eax,%ebp + movl %ecx,%esi + addl 12(%esp),%ebx + pslld $2,%xmm7 + xorl %edi,%edx + roll $5,%ecx + pxor %xmm0,%xmm4 + movdqa 96(%esp),%xmm0 + addl %ebp,%ebx + andl %edx,%esi + pxor %xmm7,%xmm4 + pshufd $238,%xmm1,%xmm5 + xorl %edi,%edx + addl %ecx,%ebx + rorl $7,%ecx + xorl %edi,%esi + movl %ebx,%ebp + punpcklqdq %xmm2,%xmm5 + movdqa %xmm4,%xmm7 + addl 16(%esp),%eax + xorl %edx,%ecx + paddd %xmm4,%xmm0 + movdqa %xmm1,80(%esp) + roll $5,%ebx + addl %esi,%eax + psrldq $4,%xmm7 + andl %ecx,%ebp + xorl %edx,%ecx + pxor %xmm1,%xmm5 + addl %ebx,%eax + rorl $7,%ebx + pxor %xmm3,%xmm7 + xorl %edx,%ebp + movl %eax,%esi + addl 20(%esp),%edi + pxor %xmm7,%xmm5 + xorl %ecx,%ebx + roll $5,%eax + movdqa %xmm0,(%esp) + addl %ebp,%edi + andl %ebx,%esi + movdqa %xmm5,%xmm1 + xorl %ecx,%ebx + addl %eax,%edi + rorl $7,%eax + movdqa %xmm5,%xmm7 + xorl %ecx,%esi + pslldq $12,%xmm1 + paddd %xmm5,%xmm5 + movl %edi,%ebp + addl 24(%esp),%edx + psrld $31,%xmm7 + xorl %ebx,%eax + roll $5,%edi + movdqa %xmm1,%xmm0 + addl %esi,%edx + andl %eax,%ebp + xorl %ebx,%eax + psrld $30,%xmm1 + addl %edi,%edx + rorl $7,%edi + por %xmm7,%xmm5 + xorl %ebx,%ebp + movl %edx,%esi + addl 28(%esp),%ecx + pslld $2,%xmm0 + xorl %eax,%edi + roll $5,%edx + pxor %xmm1,%xmm5 + movdqa 112(%esp),%xmm1 + addl %ebp,%ecx + andl %edi,%esi + pxor %xmm0,%xmm5 + pshufd $238,%xmm2,%xmm6 + xorl %eax,%edi + addl %edx,%ecx + rorl $7,%edx + xorl %eax,%esi + movl %ecx,%ebp + punpcklqdq %xmm3,%xmm6 + movdqa %xmm5,%xmm0 + addl 32(%esp),%ebx + xorl %edi,%edx + paddd %xmm5,%xmm1 + movdqa %xmm2,96(%esp) + roll $5,%ecx + addl %esi,%ebx + psrldq $4,%xmm0 + andl %edx,%ebp + xorl %edi,%edx + pxor %xmm2,%xmm6 + addl %ecx,%ebx + rorl $7,%ecx + pxor %xmm4,%xmm0 + xorl %edi,%ebp + movl %ebx,%esi + addl 36(%esp),%eax + pxor %xmm0,%xmm6 + xorl %edx,%ecx + roll $5,%ebx + movdqa %xmm1,16(%esp) + addl %ebp,%eax + andl %ecx,%esi + movdqa %xmm6,%xmm2 + xorl %edx,%ecx + addl %ebx,%eax + rorl $7,%ebx + movdqa %xmm6,%xmm0 + xorl %edx,%esi + pslldq $12,%xmm2 + paddd %xmm6,%xmm6 + movl %eax,%ebp + addl 40(%esp),%edi + psrld $31,%xmm0 + xorl %ecx,%ebx + roll $5,%eax + movdqa %xmm2,%xmm1 + addl %esi,%edi + andl %ebx,%ebp + xorl %ecx,%ebx + psrld $30,%xmm2 + addl %eax,%edi + rorl $7,%eax + por %xmm0,%xmm6 + xorl %ecx,%ebp + movdqa 64(%esp),%xmm0 + movl %edi,%esi + addl 44(%esp),%edx + pslld $2,%xmm1 + xorl %ebx,%eax + roll $5,%edi + pxor %xmm2,%xmm6 + movdqa 112(%esp),%xmm2 + addl %ebp,%edx + andl %eax,%esi + pxor %xmm1,%xmm6 + pshufd $238,%xmm3,%xmm7 + xorl %ebx,%eax + addl %edi,%edx + rorl $7,%edi + xorl %ebx,%esi + movl %edx,%ebp + punpcklqdq %xmm4,%xmm7 + movdqa %xmm6,%xmm1 + addl 48(%esp),%ecx + xorl %eax,%edi + paddd %xmm6,%xmm2 + movdqa %xmm3,64(%esp) + roll $5,%edx + addl %esi,%ecx + psrldq $4,%xmm1 + andl %edi,%ebp + xorl %eax,%edi + pxor %xmm3,%xmm7 + addl %edx,%ecx + rorl $7,%edx + pxor %xmm5,%xmm1 + xorl %eax,%ebp + movl %ecx,%esi + addl 52(%esp),%ebx + pxor %xmm1,%xmm7 + xorl %edi,%edx + roll $5,%ecx + movdqa %xmm2,32(%esp) + addl %ebp,%ebx + andl %edx,%esi + movdqa %xmm7,%xmm3 + xorl %edi,%edx + addl %ecx,%ebx + rorl $7,%ecx + movdqa %xmm7,%xmm1 + xorl %edi,%esi + pslldq $12,%xmm3 + paddd %xmm7,%xmm7 + movl %ebx,%ebp + addl 56(%esp),%eax + psrld $31,%xmm1 + xorl %edx,%ecx + roll $5,%ebx + movdqa %xmm3,%xmm2 + addl %esi,%eax + andl %ecx,%ebp + xorl %edx,%ecx + psrld $30,%xmm3 + addl %ebx,%eax + rorl $7,%ebx + por %xmm1,%xmm7 + xorl %edx,%ebp + movdqa 80(%esp),%xmm1 + movl %eax,%esi + addl 60(%esp),%edi + pslld $2,%xmm2 + xorl %ecx,%ebx + roll $5,%eax + pxor %xmm3,%xmm7 + movdqa 112(%esp),%xmm3 + addl %ebp,%edi + andl %ebx,%esi + pxor %xmm2,%xmm7 + pshufd $238,%xmm6,%xmm2 + xorl %ecx,%ebx + addl %eax,%edi + rorl $7,%eax + pxor %xmm4,%xmm0 + punpcklqdq %xmm7,%xmm2 + xorl %ecx,%esi + movl %edi,%ebp + addl (%esp),%edx + pxor %xmm1,%xmm0 + movdqa %xmm4,80(%esp) + xorl %ebx,%eax + roll $5,%edi + movdqa %xmm3,%xmm4 + addl %esi,%edx + paddd %xmm7,%xmm3 + andl %eax,%ebp + pxor %xmm2,%xmm0 + xorl %ebx,%eax + addl %edi,%edx + rorl $7,%edi + xorl %ebx,%ebp + movdqa %xmm0,%xmm2 + movdqa %xmm3,48(%esp) + movl %edx,%esi + addl 4(%esp),%ecx + xorl %eax,%edi + roll $5,%edx + pslld $2,%xmm0 + addl %ebp,%ecx + andl %edi,%esi + psrld $30,%xmm2 + xorl %eax,%edi + addl %edx,%ecx + rorl $7,%edx + xorl %eax,%esi + movl %ecx,%ebp + addl 8(%esp),%ebx + xorl %edi,%edx + roll $5,%ecx + por %xmm2,%xmm0 + addl %esi,%ebx + andl %edx,%ebp + movdqa 96(%esp),%xmm2 + xorl %edi,%edx + addl %ecx,%ebx + addl 12(%esp),%eax + xorl %edi,%ebp + movl %ebx,%esi + pshufd $238,%xmm7,%xmm3 + roll $5,%ebx + addl %ebp,%eax + xorl %edx,%esi + rorl $7,%ecx + addl %ebx,%eax + addl 16(%esp),%edi + pxor %xmm5,%xmm1 + punpcklqdq %xmm0,%xmm3 + xorl %ecx,%esi + movl %eax,%ebp + roll $5,%eax + pxor %xmm2,%xmm1 + movdqa %xmm5,96(%esp) + addl %esi,%edi + xorl %ecx,%ebp + movdqa %xmm4,%xmm5 + rorl $7,%ebx + paddd %xmm0,%xmm4 + addl %eax,%edi + pxor %xmm3,%xmm1 + addl 20(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + roll $5,%edi + movdqa %xmm1,%xmm3 + movdqa %xmm4,(%esp) + addl %ebp,%edx + xorl %ebx,%esi + rorl $7,%eax + addl %edi,%edx + pslld $2,%xmm1 + addl 24(%esp),%ecx + xorl %eax,%esi + psrld $30,%xmm3 + movl %edx,%ebp + roll $5,%edx + addl %esi,%ecx + xorl %eax,%ebp + rorl $7,%edi + addl %edx,%ecx + por %xmm3,%xmm1 + addl 28(%esp),%ebx + xorl %edi,%ebp + movdqa 64(%esp),%xmm3 + movl %ecx,%esi + roll $5,%ecx + addl %ebp,%ebx + xorl %edi,%esi + rorl $7,%edx + pshufd $238,%xmm0,%xmm4 + addl %ecx,%ebx + addl 32(%esp),%eax + pxor %xmm6,%xmm2 + punpcklqdq %xmm1,%xmm4 + xorl %edx,%esi + movl %ebx,%ebp + roll $5,%ebx + pxor %xmm3,%xmm2 + movdqa %xmm6,64(%esp) + addl %esi,%eax + xorl %edx,%ebp + movdqa 128(%esp),%xmm6 + rorl $7,%ecx + paddd %xmm1,%xmm5 + addl %ebx,%eax + pxor %xmm4,%xmm2 + addl 36(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + roll $5,%eax + movdqa %xmm2,%xmm4 + movdqa %xmm5,16(%esp) + addl %ebp,%edi + xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%edi + pslld $2,%xmm2 + addl 40(%esp),%edx + xorl %ebx,%esi + psrld $30,%xmm4 + movl %edi,%ebp + roll $5,%edi + addl %esi,%edx + xorl %ebx,%ebp + rorl $7,%eax + addl %edi,%edx + por %xmm4,%xmm2 + addl 44(%esp),%ecx + xorl %eax,%ebp + movdqa 80(%esp),%xmm4 + movl %edx,%esi + roll $5,%edx + addl %ebp,%ecx + xorl %eax,%esi + rorl $7,%edi + pshufd $238,%xmm1,%xmm5 + addl %edx,%ecx + addl 48(%esp),%ebx + pxor %xmm7,%xmm3 + punpcklqdq %xmm2,%xmm5 + xorl %edi,%esi + movl %ecx,%ebp + roll $5,%ecx + pxor %xmm4,%xmm3 + movdqa %xmm7,80(%esp) + addl %esi,%ebx + xorl %edi,%ebp + movdqa %xmm6,%xmm7 + rorl $7,%edx + paddd %xmm2,%xmm6 + addl %ecx,%ebx + pxor %xmm5,%xmm3 + addl 52(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + roll $5,%ebx + movdqa %xmm3,%xmm5 + movdqa %xmm6,32(%esp) + addl %ebp,%eax + xorl %edx,%esi + rorl $7,%ecx + addl %ebx,%eax + pslld $2,%xmm3 + addl 56(%esp),%edi + xorl %ecx,%esi + psrld $30,%xmm5 + movl %eax,%ebp + roll $5,%eax + addl %esi,%edi + xorl %ecx,%ebp + rorl $7,%ebx + addl %eax,%edi + por %xmm5,%xmm3 + addl 60(%esp),%edx + xorl %ebx,%ebp + movdqa 96(%esp),%xmm5 + movl %edi,%esi + roll $5,%edi + addl %ebp,%edx + xorl %ebx,%esi + rorl $7,%eax + pshufd $238,%xmm2,%xmm6 + addl %edi,%edx + addl (%esp),%ecx + pxor %xmm0,%xmm4 + punpcklqdq %xmm3,%xmm6 + xorl %eax,%esi + movl %edx,%ebp + roll $5,%edx + pxor %xmm5,%xmm4 + movdqa %xmm0,96(%esp) + addl %esi,%ecx + xorl %eax,%ebp + movdqa %xmm7,%xmm0 + rorl $7,%edi + paddd %xmm3,%xmm7 + addl %edx,%ecx + pxor %xmm6,%xmm4 + addl 4(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + roll $5,%ecx + movdqa %xmm4,%xmm6 + movdqa %xmm7,48(%esp) + addl %ebp,%ebx + xorl %edi,%esi + rorl $7,%edx + addl %ecx,%ebx + pslld $2,%xmm4 + addl 8(%esp),%eax + xorl %edx,%esi + psrld $30,%xmm6 + movl %ebx,%ebp + roll $5,%ebx + addl %esi,%eax + xorl %edx,%ebp + rorl $7,%ecx + addl %ebx,%eax + por %xmm6,%xmm4 + addl 12(%esp),%edi + xorl %ecx,%ebp + movdqa 64(%esp),%xmm6 + movl %eax,%esi + roll $5,%eax + addl %ebp,%edi + xorl %ecx,%esi + rorl $7,%ebx + pshufd $238,%xmm3,%xmm7 + addl %eax,%edi + addl 16(%esp),%edx + pxor %xmm1,%xmm5 + punpcklqdq %xmm4,%xmm7 + xorl %ebx,%esi + movl %edi,%ebp + roll $5,%edi + pxor %xmm6,%xmm5 + movdqa %xmm1,64(%esp) + addl %esi,%edx + xorl %ebx,%ebp + movdqa %xmm0,%xmm1 + rorl $7,%eax + paddd %xmm4,%xmm0 + addl %edi,%edx + pxor %xmm7,%xmm5 + addl 20(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + roll $5,%edx + movdqa %xmm5,%xmm7 + movdqa %xmm0,(%esp) + addl %ebp,%ecx + xorl %eax,%esi + rorl $7,%edi + addl %edx,%ecx + pslld $2,%xmm5 + addl 24(%esp),%ebx + xorl %edi,%esi + psrld $30,%xmm7 + movl %ecx,%ebp + roll $5,%ecx + addl %esi,%ebx + xorl %edi,%ebp + rorl $7,%edx + addl %ecx,%ebx + por %xmm7,%xmm5 + addl 28(%esp),%eax + movdqa 80(%esp),%xmm7 + rorl $7,%ecx + movl %ebx,%esi + xorl %edx,%ebp + roll $5,%ebx + pshufd $238,%xmm4,%xmm0 + addl %ebp,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + addl 32(%esp),%edi + pxor %xmm2,%xmm6 + punpcklqdq %xmm5,%xmm0 + andl %ecx,%esi + xorl %edx,%ecx + rorl $7,%ebx + pxor %xmm7,%xmm6 + movdqa %xmm2,80(%esp) + movl %eax,%ebp + xorl %ecx,%esi + roll $5,%eax + movdqa %xmm1,%xmm2 + addl %esi,%edi + paddd %xmm5,%xmm1 + xorl %ebx,%ebp + pxor %xmm0,%xmm6 + xorl %ecx,%ebx + addl %eax,%edi + addl 36(%esp),%edx + andl %ebx,%ebp + movdqa %xmm6,%xmm0 + movdqa %xmm1,16(%esp) + xorl %ecx,%ebx + rorl $7,%eax + movl %edi,%esi + xorl %ebx,%ebp + roll $5,%edi + pslld $2,%xmm6 + addl %ebp,%edx + xorl %eax,%esi + psrld $30,%xmm0 + xorl %ebx,%eax + addl %edi,%edx + addl 40(%esp),%ecx + andl %eax,%esi + xorl %ebx,%eax + rorl $7,%edi + por %xmm0,%xmm6 + movl %edx,%ebp + xorl %eax,%esi + movdqa 96(%esp),%xmm0 + roll $5,%edx + addl %esi,%ecx + xorl %edi,%ebp + xorl %eax,%edi + addl %edx,%ecx + pshufd $238,%xmm5,%xmm1 + addl 44(%esp),%ebx + andl %edi,%ebp + xorl %eax,%edi + rorl $7,%edx + movl %ecx,%esi + xorl %edi,%ebp + roll $5,%ecx + addl %ebp,%ebx + xorl %edx,%esi + xorl %edi,%edx + addl %ecx,%ebx + addl 48(%esp),%eax + pxor %xmm3,%xmm7 + punpcklqdq %xmm6,%xmm1 + andl %edx,%esi + xorl %edi,%edx + rorl $7,%ecx + pxor %xmm0,%xmm7 + movdqa %xmm3,96(%esp) + movl %ebx,%ebp + xorl %edx,%esi + roll $5,%ebx + movdqa 144(%esp),%xmm3 + addl %esi,%eax + paddd %xmm6,%xmm2 + xorl %ecx,%ebp + pxor %xmm1,%xmm7 + xorl %edx,%ecx + addl %ebx,%eax + addl 52(%esp),%edi + andl %ecx,%ebp + movdqa %xmm7,%xmm1 + movdqa %xmm2,32(%esp) + xorl %edx,%ecx + rorl $7,%ebx + movl %eax,%esi + xorl %ecx,%ebp + roll $5,%eax + pslld $2,%xmm7 + addl %ebp,%edi + xorl %ebx,%esi + psrld $30,%xmm1 + xorl %ecx,%ebx + addl %eax,%edi + addl 56(%esp),%edx + andl %ebx,%esi + xorl %ecx,%ebx + rorl $7,%eax + por %xmm1,%xmm7 + movl %edi,%ebp + xorl %ebx,%esi + movdqa 64(%esp),%xmm1 + roll $5,%edi + addl %esi,%edx + xorl %eax,%ebp + xorl %ebx,%eax + addl %edi,%edx + pshufd $238,%xmm6,%xmm2 + addl 60(%esp),%ecx + andl %eax,%ebp + xorl %ebx,%eax + rorl $7,%edi + movl %edx,%esi + xorl %eax,%ebp + roll $5,%edx + addl %ebp,%ecx + xorl %edi,%esi + xorl %eax,%edi + addl %edx,%ecx + addl (%esp),%ebx + pxor %xmm4,%xmm0 + punpcklqdq %xmm7,%xmm2 + andl %edi,%esi + xorl %eax,%edi + rorl $7,%edx + pxor %xmm1,%xmm0 + movdqa %xmm4,64(%esp) + movl %ecx,%ebp + xorl %edi,%esi + roll $5,%ecx + movdqa %xmm3,%xmm4 + addl %esi,%ebx + paddd %xmm7,%xmm3 + xorl %edx,%ebp + pxor %xmm2,%xmm0 + xorl %edi,%edx + addl %ecx,%ebx + addl 4(%esp),%eax + andl %edx,%ebp + movdqa %xmm0,%xmm2 + movdqa %xmm3,48(%esp) + xorl %edi,%edx + rorl $7,%ecx + movl %ebx,%esi + xorl %edx,%ebp + roll $5,%ebx + pslld $2,%xmm0 + addl %ebp,%eax + xorl %ecx,%esi + psrld $30,%xmm2 + xorl %edx,%ecx + addl %ebx,%eax + addl 8(%esp),%edi + andl %ecx,%esi + xorl %edx,%ecx + rorl $7,%ebx + por %xmm2,%xmm0 + movl %eax,%ebp + xorl %ecx,%esi + movdqa 80(%esp),%xmm2 + roll $5,%eax + addl %esi,%edi + xorl %ebx,%ebp + xorl %ecx,%ebx + addl %eax,%edi + pshufd $238,%xmm7,%xmm3 + addl 12(%esp),%edx + andl %ebx,%ebp + xorl %ecx,%ebx + rorl $7,%eax + movl %edi,%esi + xorl %ebx,%ebp + roll $5,%edi + addl %ebp,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %edi,%edx + addl 16(%esp),%ecx + pxor %xmm5,%xmm1 + punpcklqdq %xmm0,%xmm3 + andl %eax,%esi + xorl %ebx,%eax + rorl $7,%edi + pxor %xmm2,%xmm1 + movdqa %xmm5,80(%esp) + movl %edx,%ebp + xorl %eax,%esi + roll $5,%edx + movdqa %xmm4,%xmm5 + addl %esi,%ecx + paddd %xmm0,%xmm4 + xorl %edi,%ebp + pxor %xmm3,%xmm1 + xorl %eax,%edi + addl %edx,%ecx + addl 20(%esp),%ebx + andl %edi,%ebp + movdqa %xmm1,%xmm3 + movdqa %xmm4,(%esp) + xorl %eax,%edi + rorl $7,%edx + movl %ecx,%esi + xorl %edi,%ebp + roll $5,%ecx + pslld $2,%xmm1 + addl %ebp,%ebx + xorl %edx,%esi + psrld $30,%xmm3 + xorl %edi,%edx + addl %ecx,%ebx + addl 24(%esp),%eax + andl %edx,%esi + xorl %edi,%edx + rorl $7,%ecx + por %xmm3,%xmm1 + movl %ebx,%ebp + xorl %edx,%esi + movdqa 96(%esp),%xmm3 + roll $5,%ebx + addl %esi,%eax + xorl %ecx,%ebp + xorl %edx,%ecx + addl %ebx,%eax + pshufd $238,%xmm0,%xmm4 + addl 28(%esp),%edi + andl %ecx,%ebp + xorl %edx,%ecx + rorl $7,%ebx + movl %eax,%esi + xorl %ecx,%ebp + roll $5,%eax + addl %ebp,%edi + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%edi + addl 32(%esp),%edx + pxor %xmm6,%xmm2 + punpcklqdq %xmm1,%xmm4 + andl %ebx,%esi + xorl %ecx,%ebx + rorl $7,%eax + pxor %xmm3,%xmm2 + movdqa %xmm6,96(%esp) + movl %edi,%ebp + xorl %ebx,%esi + roll $5,%edi + movdqa %xmm5,%xmm6 + addl %esi,%edx + paddd %xmm1,%xmm5 + xorl %eax,%ebp + pxor %xmm4,%xmm2 + xorl %ebx,%eax + addl %edi,%edx + addl 36(%esp),%ecx + andl %eax,%ebp + movdqa %xmm2,%xmm4 + movdqa %xmm5,16(%esp) + xorl %ebx,%eax + rorl $7,%edi + movl %edx,%esi + xorl %eax,%ebp + roll $5,%edx + pslld $2,%xmm2 + addl %ebp,%ecx + xorl %edi,%esi + psrld $30,%xmm4 + xorl %eax,%edi + addl %edx,%ecx + addl 40(%esp),%ebx + andl %edi,%esi + xorl %eax,%edi + rorl $7,%edx + por %xmm4,%xmm2 + movl %ecx,%ebp + xorl %edi,%esi + movdqa 64(%esp),%xmm4 + roll $5,%ecx + addl %esi,%ebx + xorl %edx,%ebp + xorl %edi,%edx + addl %ecx,%ebx + pshufd $238,%xmm1,%xmm5 + addl 44(%esp),%eax + andl %edx,%ebp + xorl %edi,%edx + rorl $7,%ecx + movl %ebx,%esi + xorl %edx,%ebp + roll $5,%ebx + addl %ebp,%eax + xorl %edx,%esi + addl %ebx,%eax + addl 48(%esp),%edi + pxor %xmm7,%xmm3 + punpcklqdq %xmm2,%xmm5 + xorl %ecx,%esi + movl %eax,%ebp + roll $5,%eax + pxor %xmm4,%xmm3 + movdqa %xmm7,64(%esp) + addl %esi,%edi + xorl %ecx,%ebp + movdqa %xmm6,%xmm7 + rorl $7,%ebx + paddd %xmm2,%xmm6 + addl %eax,%edi + pxor %xmm5,%xmm3 + addl 52(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + roll $5,%edi + movdqa %xmm3,%xmm5 + movdqa %xmm6,32(%esp) + addl %ebp,%edx + xorl %ebx,%esi + rorl $7,%eax + addl %edi,%edx + pslld $2,%xmm3 + addl 56(%esp),%ecx + xorl %eax,%esi + psrld $30,%xmm5 + movl %edx,%ebp + roll $5,%edx + addl %esi,%ecx + xorl %eax,%ebp + rorl $7,%edi + addl %edx,%ecx + por %xmm5,%xmm3 + addl 60(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + roll $5,%ecx + addl %ebp,%ebx + xorl %edi,%esi + rorl $7,%edx + addl %ecx,%ebx + addl (%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + roll $5,%ebx + addl %esi,%eax + xorl %edx,%ebp + rorl $7,%ecx + paddd %xmm3,%xmm7 + addl %ebx,%eax + addl 4(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + movdqa %xmm7,48(%esp) + roll $5,%eax + addl %ebp,%edi + xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%edi + addl 8(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + roll $5,%edi + addl %esi,%edx + xorl %ebx,%ebp + rorl $7,%eax + addl %edi,%edx + addl 12(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + roll $5,%edx + addl %ebp,%ecx + xorl %eax,%esi + rorl $7,%edi + addl %edx,%ecx + movl 196(%esp),%ebp + cmpl 200(%esp),%ebp + je .L007done + movdqa 160(%esp),%xmm7 + movdqa 176(%esp),%xmm6 + movdqu (%ebp),%xmm0 + movdqu 16(%ebp),%xmm1 + movdqu 32(%ebp),%xmm2 + movdqu 48(%ebp),%xmm3 + addl $64,%ebp +.byte 102,15,56,0,198 + movl %ebp,196(%esp) + movdqa %xmm7,96(%esp) + addl 16(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + roll $5,%ecx + addl %esi,%ebx + xorl %edi,%ebp + rorl $7,%edx +.byte 102,15,56,0,206 + addl %ecx,%ebx + addl 20(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + paddd %xmm7,%xmm0 + roll $5,%ebx + addl %ebp,%eax + xorl %edx,%esi + rorl $7,%ecx + movdqa %xmm0,(%esp) + addl %ebx,%eax + addl 24(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + psubd %xmm7,%xmm0 + roll $5,%eax + addl %esi,%edi + xorl %ecx,%ebp + rorl $7,%ebx + addl %eax,%edi + addl 28(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + roll $5,%edi + addl %ebp,%edx + xorl %ebx,%esi + rorl $7,%eax + addl %edi,%edx + addl 32(%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + roll $5,%edx + addl %esi,%ecx + xorl %eax,%ebp + rorl $7,%edi +.byte 102,15,56,0,214 + addl %edx,%ecx + addl 36(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + paddd %xmm7,%xmm1 + roll $5,%ecx + addl %ebp,%ebx + xorl %edi,%esi + rorl $7,%edx + movdqa %xmm1,16(%esp) + addl %ecx,%ebx + addl 40(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + psubd %xmm7,%xmm1 + roll $5,%ebx + addl %esi,%eax + xorl %edx,%ebp + rorl $7,%ecx + addl %ebx,%eax + addl 44(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + roll $5,%eax + addl %ebp,%edi + xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%edi + addl 48(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + roll $5,%edi + addl %esi,%edx + xorl %ebx,%ebp + rorl $7,%eax +.byte 102,15,56,0,222 + addl %edi,%edx + addl 52(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + paddd %xmm7,%xmm2 + roll $5,%edx + addl %ebp,%ecx + xorl %eax,%esi + rorl $7,%edi + movdqa %xmm2,32(%esp) + addl %edx,%ecx + addl 56(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + psubd %xmm7,%xmm2 + roll $5,%ecx + addl %esi,%ebx + xorl %edi,%ebp + rorl $7,%edx + addl %ecx,%ebx + addl 60(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + roll $5,%ebx + addl %ebp,%eax + rorl $7,%ecx + addl %ebx,%eax + movl 192(%esp),%ebp + addl (%ebp),%eax + addl 4(%ebp),%esi + addl 8(%ebp),%ecx + movl %eax,(%ebp) + addl 12(%ebp),%edx + movl %esi,4(%ebp) + addl 16(%ebp),%edi + movl %ecx,8(%ebp) + movl %ecx,%ebx + movl %edx,12(%ebp) + xorl %edx,%ebx + movl %edi,16(%ebp) + movl %esi,%ebp + pshufd $238,%xmm0,%xmm4 + andl %ebx,%esi + movl %ebp,%ebx + jmp .L006loop +.align 16 +.L007done: + addl 16(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + roll $5,%ecx + addl %esi,%ebx + xorl %edi,%ebp + rorl $7,%edx + addl %ecx,%ebx + addl 20(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + roll $5,%ebx + addl %ebp,%eax + xorl %edx,%esi + rorl $7,%ecx + addl %ebx,%eax + addl 24(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + roll $5,%eax + addl %esi,%edi + xorl %ecx,%ebp + rorl $7,%ebx + addl %eax,%edi + addl 28(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + roll $5,%edi + addl %ebp,%edx + xorl %ebx,%esi + rorl $7,%eax + addl %edi,%edx + addl 32(%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + roll $5,%edx + addl %esi,%ecx + xorl %eax,%ebp + rorl $7,%edi + addl %edx,%ecx + addl 36(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + roll $5,%ecx + addl %ebp,%ebx + xorl %edi,%esi + rorl $7,%edx + addl %ecx,%ebx + addl 40(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + roll $5,%ebx + addl %esi,%eax + xorl %edx,%ebp + rorl $7,%ecx + addl %ebx,%eax + addl 44(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + roll $5,%eax + addl %ebp,%edi + xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%edi + addl 48(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + roll $5,%edi + addl %esi,%edx + xorl %ebx,%ebp + rorl $7,%eax + addl %edi,%edx + addl 52(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + roll $5,%edx + addl %ebp,%ecx + xorl %eax,%esi + rorl $7,%edi + addl %edx,%ecx + addl 56(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + roll $5,%ecx + addl %esi,%ebx + xorl %edi,%ebp + rorl $7,%edx + addl %ecx,%ebx + addl 60(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + roll $5,%ebx + addl %ebp,%eax + rorl $7,%ecx + addl %ebx,%eax + movl 192(%esp),%ebp + addl (%ebp),%eax + movl 204(%esp),%esp + addl 4(%ebp),%esi + addl 8(%ebp),%ecx + movl %eax,(%ebp) + addl 12(%ebp),%edx + movl %esi,4(%ebp) + addl 16(%ebp),%edi + movl %ecx,8(%ebp) + movl %edx,12(%ebp) + movl %edi,16(%ebp) + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size _sha1_block_data_order_ssse3,.-_sha1_block_data_order_ssse3 +.align 64 +.LK_XX_XX: +.long 1518500249,1518500249,1518500249,1518500249 +.long 1859775393,1859775393,1859775393,1859775393 +.long 2400959708,2400959708,2400959708,2400959708 +.long 3395469782,3395469782,3395469782,3395469782 +.long 66051,67438087,134810123,202182159 +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 .byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115 .byte 102,111,114,109,32,102,111,114,32,120,56,54,44,32,67,82 .byte 89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112 diff --git a/linux-x86/crypto/sha/sha256-586.S b/linux-x86/crypto/sha/sha256-586.S index fe41afc..08d9484 100644 --- a/linux-x86/crypto/sha/sha256-586.S +++ b/linux-x86/crypto/sha/sha256-586.S @@ -27,6 +27,27 @@ sha256_block_data_order: movl %edi,4(%esp) movl %eax,8(%esp) movl %ebx,12(%esp) + leal OPENSSL_ia32cap_P-.L001K256(%ebp),%edx + movl (%edx),%ecx + movl 4(%edx),%ebx + testl $1048576,%ecx + jnz .L002loop + movl 8(%edx),%edx + testl $16777216,%ecx + jz .L003no_xmm + andl $1073741824,%ecx + andl $268435968,%ebx + testl $536870912,%edx + jnz .L004shaext + orl %ebx,%ecx + andl $1342177280,%ecx + cmpl $1342177280,%ecx + testl $512,%ebx + jnz .L005SSSE3 +.L003no_xmm: + subl %edi,%eax + cmpl $256,%eax + jae .L006unrolled jmp .L002loop .align 16 .L002loop: @@ -98,7 +119,7 @@ sha256_block_data_order: movl %ecx,28(%esp) movl %edi,32(%esp) .align 16 -.L00300_15: +.L00700_15: movl %edx,%ecx movl 24(%esp),%esi rorl $14,%ecx @@ -136,11 +157,11 @@ sha256_block_data_order: addl $4,%ebp addl %ebx,%eax cmpl $3248222580,%esi - jne .L00300_15 + jne .L00700_15 movl 156(%esp),%ecx - jmp .L00416_63 + jmp .L00816_63 .align 16 -.L00416_63: +.L00816_63: movl %ecx,%ebx movl 104(%esp),%esi rorl $11,%ecx @@ -195,7 +216,7 @@ sha256_block_data_order: addl $4,%ebp addl %ebx,%eax cmpl $3329325298,%esi - jne .L00416_63 + jne .L00816_63 movl 356(%esp),%esi movl 8(%esp),%ebx movl 16(%esp),%ecx @@ -229,207 +250,6 @@ sha256_block_data_order: popl %ebx popl %ebp ret -.align 32 -.L005loop_shrd: - movl (%edi),%eax - movl 4(%edi),%ebx - movl 8(%edi),%ecx - bswap %eax - movl 12(%edi),%edx - bswap %ebx - pushl %eax - bswap %ecx - pushl %ebx - bswap %edx - pushl %ecx - pushl %edx - movl 16(%edi),%eax - movl 20(%edi),%ebx - movl 24(%edi),%ecx - bswap %eax - movl 28(%edi),%edx - bswap %ebx - pushl %eax - bswap %ecx - pushl %ebx - bswap %edx - pushl %ecx - pushl %edx - movl 32(%edi),%eax - movl 36(%edi),%ebx - movl 40(%edi),%ecx - bswap %eax - movl 44(%edi),%edx - bswap %ebx - pushl %eax - bswap %ecx - pushl %ebx - bswap %edx - pushl %ecx - pushl %edx - movl 48(%edi),%eax - movl 52(%edi),%ebx - movl 56(%edi),%ecx - bswap %eax - movl 60(%edi),%edx - bswap %ebx - pushl %eax - bswap %ecx - pushl %ebx - bswap %edx - pushl %ecx - pushl %edx - addl $64,%edi - leal -36(%esp),%esp - movl %edi,104(%esp) - movl (%esi),%eax - movl 4(%esi),%ebx - movl 8(%esi),%ecx - movl 12(%esi),%edi - movl %ebx,8(%esp) - xorl %ecx,%ebx - movl %ecx,12(%esp) - movl %edi,16(%esp) - movl %ebx,(%esp) - movl 16(%esi),%edx - movl 20(%esi),%ebx - movl 24(%esi),%ecx - movl 28(%esi),%edi - movl %ebx,24(%esp) - movl %ecx,28(%esp) - movl %edi,32(%esp) -.align 16 -.L00600_15_shrd: - movl %edx,%ecx - movl 24(%esp),%esi - shrdl $14,%ecx,%ecx - movl 28(%esp),%edi - xorl %edx,%ecx - xorl %edi,%esi - movl 96(%esp),%ebx - shrdl $5,%ecx,%ecx - andl %edx,%esi - movl %edx,20(%esp) - xorl %ecx,%edx - addl 32(%esp),%ebx - xorl %edi,%esi - shrdl $6,%edx,%edx - movl %eax,%ecx - addl %esi,%ebx - shrdl $9,%ecx,%ecx - addl %edx,%ebx - movl 8(%esp),%edi - xorl %eax,%ecx - movl %eax,4(%esp) - leal -4(%esp),%esp - shrdl $11,%ecx,%ecx - movl (%ebp),%esi - xorl %eax,%ecx - movl 20(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %esi,%ebx - movl %eax,(%esp) - addl %ebx,%edx - andl 4(%esp),%eax - addl %ecx,%ebx - xorl %edi,%eax - addl $4,%ebp - addl %ebx,%eax - cmpl $3248222580,%esi - jne .L00600_15_shrd - movl 156(%esp),%ecx - jmp .L00716_63_shrd -.align 16 -.L00716_63_shrd: - movl %ecx,%ebx - movl 104(%esp),%esi - shrdl $11,%ecx,%ecx - movl %esi,%edi - shrdl $2,%esi,%esi - xorl %ebx,%ecx - shrl $3,%ebx - shrdl $7,%ecx,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - shrdl $17,%esi,%esi - addl 160(%esp),%ebx - shrl $10,%edi - addl 124(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 24(%esp),%esi - shrdl $14,%ecx,%ecx - addl %edi,%ebx - movl 28(%esp),%edi - xorl %edx,%ecx - xorl %edi,%esi - movl %ebx,96(%esp) - shrdl $5,%ecx,%ecx - andl %edx,%esi - movl %edx,20(%esp) - xorl %ecx,%edx - addl 32(%esp),%ebx - xorl %edi,%esi - shrdl $6,%edx,%edx - movl %eax,%ecx - addl %esi,%ebx - shrdl $9,%ecx,%ecx - addl %edx,%ebx - movl 8(%esp),%edi - xorl %eax,%ecx - movl %eax,4(%esp) - leal -4(%esp),%esp - shrdl $11,%ecx,%ecx - movl (%ebp),%esi - xorl %eax,%ecx - movl 20(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %esi,%ebx - movl %eax,(%esp) - addl %ebx,%edx - andl 4(%esp),%eax - addl %ecx,%ebx - xorl %edi,%eax - movl 156(%esp),%ecx - addl $4,%ebp - addl %ebx,%eax - cmpl $3329325298,%esi - jne .L00716_63_shrd - movl 356(%esp),%esi - movl 8(%esp),%ebx - movl 16(%esp),%ecx - addl (%esi),%eax - addl 4(%esi),%ebx - addl 8(%esi),%edi - addl 12(%esi),%ecx - movl %eax,(%esi) - movl %ebx,4(%esi) - movl %edi,8(%esi) - movl %ecx,12(%esi) - movl 24(%esp),%eax - movl 28(%esp),%ebx - movl 32(%esp),%ecx - movl 360(%esp),%edi - addl 16(%esi),%edx - addl 20(%esi),%eax - addl 24(%esi),%ebx - addl 28(%esi),%ecx - movl %edx,16(%esi) - movl %eax,20(%esi) - movl %ebx,24(%esi) - movl %ecx,28(%esi) - leal 356(%esp),%esp - subl $256,%ebp - cmpl 8(%esp),%edi - jb .L005loop_shrd - movl 12(%esp),%esp - popl %edi - popl %esi - popl %ebx - popl %ebp - ret .align 64 .L001K256: .long 1116352408,1899447441,3049323471,3921009573,961987163,1508970993,2453635748,2870763221,3624381080,310598401,607225278,1426881987,1925078388,2162078206,2614888103,3248222580,3835390401,4022224774,264347078,604807628,770255983,1249150122,1555081692,1996064986,2554220882,2821834349,2952996808,3210313671,3336571891,3584528711,113926993,338241895,666307205,773529912,1294757372,1396182291,1695183700,1986661051,2177026350,2456956037,2730485921,2820302411,3259730800,3345764771,3516065817,3600352804,4094571909,275423344,430227734,506948616,659060556,883997877,958139571,1322822218,1537002063,1747873779,1955562222,2024104815,2227730452,2361852424,2428436474,2756734187,3204031479,3329325298 @@ -440,7 +260,7 @@ sha256_block_data_order: .byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 .byte 62,0 .align 16 -.L008unrolled: +.L006unrolled: leal -96(%esp),%esp movl (%esi),%eax movl 4(%esi),%ebp @@ -3346,5 +3166,1414 @@ sha256_block_data_order: popl %ebx popl %ebp ret +.align 32 +.L004shaext: + subl $32,%esp + movdqu (%esi),%xmm1 + leal 128(%ebp),%ebp + movdqu 16(%esi),%xmm2 + movdqa 128(%ebp),%xmm7 + pshufd $27,%xmm1,%xmm0 + pshufd $177,%xmm1,%xmm1 + pshufd $27,%xmm2,%xmm2 +.byte 102,15,58,15,202,8 + punpcklqdq %xmm0,%xmm2 + jmp .L010loop_shaext +.align 16 +.L010loop_shaext: + movdqu (%edi),%xmm3 + movdqu 16(%edi),%xmm4 + movdqu 32(%edi),%xmm5 +.byte 102,15,56,0,223 + movdqu 48(%edi),%xmm6 + movdqa %xmm2,16(%esp) + movdqa -128(%ebp),%xmm0 + paddd %xmm3,%xmm0 +.byte 102,15,56,0,231 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + nop + movdqa %xmm1,(%esp) +.byte 15,56,203,202 + movdqa -112(%ebp),%xmm0 + paddd %xmm4,%xmm0 +.byte 102,15,56,0,239 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + leal 64(%edi),%edi +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa -96(%ebp),%xmm0 + paddd %xmm5,%xmm0 +.byte 102,15,56,0,247 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa -80(%ebp),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa -64(%ebp),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa -48(%ebp),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa -32(%ebp),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa -16(%ebp),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa (%ebp),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 16(%ebp),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 32(%ebp),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 48(%ebp),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 64(%ebp),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 80(%ebp),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 +.byte 15,56,203,202 + paddd %xmm7,%xmm6 + movdqa 96(%ebp),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 +.byte 15,56,205,245 + movdqa 128(%ebp),%xmm7 +.byte 15,56,203,202 + movdqa 112(%ebp),%xmm0 + paddd %xmm6,%xmm0 + nop +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + cmpl %edi,%eax + nop +.byte 15,56,203,202 + paddd 16(%esp),%xmm2 + paddd (%esp),%xmm1 + jnz .L010loop_shaext + pshufd $177,%xmm2,%xmm2 + pshufd $27,%xmm1,%xmm7 + pshufd $177,%xmm1,%xmm1 + punpckhqdq %xmm2,%xmm1 +.byte 102,15,58,15,215,8 + movl 44(%esp),%esp + movdqu %xmm1,(%esi) + movdqu %xmm2,16(%esi) + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.align 32 +.L005SSSE3: + leal -96(%esp),%esp + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edi + movl %ebx,4(%esp) + xorl %ecx,%ebx + movl %ecx,8(%esp) + movl %edi,12(%esp) + movl 16(%esi),%edx + movl 20(%esi),%edi + movl 24(%esi),%ecx + movl 28(%esi),%esi + movl %edi,20(%esp) + movl 100(%esp),%edi + movl %ecx,24(%esp) + movl %esi,28(%esp) + movdqa 256(%ebp),%xmm7 + jmp .L011grand_ssse3 +.align 16 +.L011grand_ssse3: + movdqu (%edi),%xmm0 + movdqu 16(%edi),%xmm1 + movdqu 32(%edi),%xmm2 + movdqu 48(%edi),%xmm3 + addl $64,%edi +.byte 102,15,56,0,199 + movl %edi,100(%esp) +.byte 102,15,56,0,207 + movdqa (%ebp),%xmm4 +.byte 102,15,56,0,215 + movdqa 16(%ebp),%xmm5 + paddd %xmm0,%xmm4 +.byte 102,15,56,0,223 + movdqa 32(%ebp),%xmm6 + paddd %xmm1,%xmm5 + movdqa 48(%ebp),%xmm7 + movdqa %xmm4,32(%esp) + paddd %xmm2,%xmm6 + movdqa %xmm5,48(%esp) + paddd %xmm3,%xmm7 + movdqa %xmm6,64(%esp) + movdqa %xmm7,80(%esp) + jmp .L012ssse3_00_47 +.align 16 +.L012ssse3_00_47: + addl $64,%ebp + movl %edx,%ecx + movdqa %xmm1,%xmm4 + rorl $14,%edx + movl 20(%esp),%esi + movdqa %xmm3,%xmm7 + xorl %ecx,%edx + movl 24(%esp),%edi +.byte 102,15,58,15,224,4 + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi +.byte 102,15,58,15,250,4 + movl %ecx,16(%esp) + xorl %ecx,%edx + xorl %esi,%edi + movdqa %xmm4,%xmm5 + rorl $6,%edx + movl %eax,%ecx + movdqa %xmm4,%xmm6 + addl %edi,%edx + movl 4(%esp),%edi + psrld $3,%xmm4 + movl %eax,%esi + rorl $9,%ecx + paddd %xmm7,%xmm0 + movl %eax,(%esp) + xorl %eax,%ecx + psrld $7,%xmm6 + xorl %edi,%eax + addl 28(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + pshufd $250,%xmm3,%xmm7 + xorl %esi,%ecx + addl 32(%esp),%edx + pslld $14,%xmm5 + xorl %edi,%ebx + rorl $2,%ecx + pxor %xmm6,%xmm4 + addl %edx,%ebx + addl 12(%esp),%edx + psrld $11,%xmm6 + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + pxor %xmm5,%xmm4 + movl 16(%esp),%esi + xorl %ecx,%edx + pslld $11,%xmm5 + movl 20(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + pxor %xmm6,%xmm4 + andl %ecx,%esi + movl %ecx,12(%esp) + movdqa %xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + pxor %xmm5,%xmm4 + movl %ebx,%ecx + addl %edi,%edx + psrld $10,%xmm7 + movl (%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + paddd %xmm4,%xmm0 + movl %ebx,28(%esp) + xorl %ebx,%ecx + psrlq $17,%xmm6 + xorl %edi,%ebx + addl 24(%esp),%edx + rorl $11,%ecx + pxor %xmm6,%xmm7 + andl %ebx,%eax + xorl %esi,%ecx + psrlq $2,%xmm6 + addl 36(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + pxor %xmm6,%xmm7 + addl %edx,%eax + addl 8(%esp),%edx + pshufd $128,%xmm7,%xmm7 + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 12(%esp),%esi + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + psrldq $8,%xmm7 + movl %ecx,8(%esp) + xorl %ecx,%edx + xorl %esi,%edi + paddd %xmm7,%xmm0 + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,24(%esp) + pshufd $80,%xmm0,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + movdqa %xmm7,%xmm6 + rorl $11,%ecx + psrld $10,%xmm7 + andl %eax,%ebx + psrlq $17,%xmm6 + xorl %esi,%ecx + addl 40(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + pxor %xmm6,%xmm7 + addl %edx,%ebx + addl 4(%esp),%edx + psrlq $2,%xmm6 + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + pxor %xmm6,%xmm7 + movl 8(%esp),%esi + xorl %ecx,%edx + movl 12(%esp),%edi + pshufd $8,%xmm7,%xmm7 + xorl %edi,%esi + rorl $5,%edx + movdqa (%ebp),%xmm6 + andl %ecx,%esi + movl %ecx,4(%esp) + pslldq $8,%xmm7 + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + paddd %xmm7,%xmm0 + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + paddd %xmm0,%xmm6 + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 44(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + movdqa %xmm6,32(%esp) + movl %edx,%ecx + movdqa %xmm2,%xmm4 + rorl $14,%edx + movl 4(%esp),%esi + movdqa %xmm0,%xmm7 + xorl %ecx,%edx + movl 8(%esp),%edi +.byte 102,15,58,15,225,4 + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi +.byte 102,15,58,15,251,4 + movl %ecx,(%esp) + xorl %ecx,%edx + xorl %esi,%edi + movdqa %xmm4,%xmm5 + rorl $6,%edx + movl %eax,%ecx + movdqa %xmm4,%xmm6 + addl %edi,%edx + movl 20(%esp),%edi + psrld $3,%xmm4 + movl %eax,%esi + rorl $9,%ecx + paddd %xmm7,%xmm1 + movl %eax,16(%esp) + xorl %eax,%ecx + psrld $7,%xmm6 + xorl %edi,%eax + addl 12(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + pshufd $250,%xmm0,%xmm7 + xorl %esi,%ecx + addl 48(%esp),%edx + pslld $14,%xmm5 + xorl %edi,%ebx + rorl $2,%ecx + pxor %xmm6,%xmm4 + addl %edx,%ebx + addl 28(%esp),%edx + psrld $11,%xmm6 + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + pxor %xmm5,%xmm4 + movl (%esp),%esi + xorl %ecx,%edx + pslld $11,%xmm5 + movl 4(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + pxor %xmm6,%xmm4 + andl %ecx,%esi + movl %ecx,28(%esp) + movdqa %xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + pxor %xmm5,%xmm4 + movl %ebx,%ecx + addl %edi,%edx + psrld $10,%xmm7 + movl 16(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + paddd %xmm4,%xmm1 + movl %ebx,12(%esp) + xorl %ebx,%ecx + psrlq $17,%xmm6 + xorl %edi,%ebx + addl 8(%esp),%edx + rorl $11,%ecx + pxor %xmm6,%xmm7 + andl %ebx,%eax + xorl %esi,%ecx + psrlq $2,%xmm6 + addl 52(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + pxor %xmm6,%xmm7 + addl %edx,%eax + addl 24(%esp),%edx + pshufd $128,%xmm7,%xmm7 + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 28(%esp),%esi + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + psrldq $8,%xmm7 + movl %ecx,24(%esp) + xorl %ecx,%edx + xorl %esi,%edi + paddd %xmm7,%xmm1 + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,8(%esp) + pshufd $80,%xmm1,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + movdqa %xmm7,%xmm6 + rorl $11,%ecx + psrld $10,%xmm7 + andl %eax,%ebx + psrlq $17,%xmm6 + xorl %esi,%ecx + addl 56(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + pxor %xmm6,%xmm7 + addl %edx,%ebx + addl 20(%esp),%edx + psrlq $2,%xmm6 + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + pxor %xmm6,%xmm7 + movl 24(%esp),%esi + xorl %ecx,%edx + movl 28(%esp),%edi + pshufd $8,%xmm7,%xmm7 + xorl %edi,%esi + rorl $5,%edx + movdqa 16(%ebp),%xmm6 + andl %ecx,%esi + movl %ecx,20(%esp) + pslldq $8,%xmm7 + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + paddd %xmm7,%xmm1 + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + paddd %xmm1,%xmm6 + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 60(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movdqa %xmm6,48(%esp) + movl %edx,%ecx + movdqa %xmm3,%xmm4 + rorl $14,%edx + movl 20(%esp),%esi + movdqa %xmm1,%xmm7 + xorl %ecx,%edx + movl 24(%esp),%edi +.byte 102,15,58,15,226,4 + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi +.byte 102,15,58,15,248,4 + movl %ecx,16(%esp) + xorl %ecx,%edx + xorl %esi,%edi + movdqa %xmm4,%xmm5 + rorl $6,%edx + movl %eax,%ecx + movdqa %xmm4,%xmm6 + addl %edi,%edx + movl 4(%esp),%edi + psrld $3,%xmm4 + movl %eax,%esi + rorl $9,%ecx + paddd %xmm7,%xmm2 + movl %eax,(%esp) + xorl %eax,%ecx + psrld $7,%xmm6 + xorl %edi,%eax + addl 28(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + pshufd $250,%xmm1,%xmm7 + xorl %esi,%ecx + addl 64(%esp),%edx + pslld $14,%xmm5 + xorl %edi,%ebx + rorl $2,%ecx + pxor %xmm6,%xmm4 + addl %edx,%ebx + addl 12(%esp),%edx + psrld $11,%xmm6 + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + pxor %xmm5,%xmm4 + movl 16(%esp),%esi + xorl %ecx,%edx + pslld $11,%xmm5 + movl 20(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + pxor %xmm6,%xmm4 + andl %ecx,%esi + movl %ecx,12(%esp) + movdqa %xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + pxor %xmm5,%xmm4 + movl %ebx,%ecx + addl %edi,%edx + psrld $10,%xmm7 + movl (%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + paddd %xmm4,%xmm2 + movl %ebx,28(%esp) + xorl %ebx,%ecx + psrlq $17,%xmm6 + xorl %edi,%ebx + addl 24(%esp),%edx + rorl $11,%ecx + pxor %xmm6,%xmm7 + andl %ebx,%eax + xorl %esi,%ecx + psrlq $2,%xmm6 + addl 68(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + pxor %xmm6,%xmm7 + addl %edx,%eax + addl 8(%esp),%edx + pshufd $128,%xmm7,%xmm7 + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 12(%esp),%esi + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + psrldq $8,%xmm7 + movl %ecx,8(%esp) + xorl %ecx,%edx + xorl %esi,%edi + paddd %xmm7,%xmm2 + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,24(%esp) + pshufd $80,%xmm2,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + movdqa %xmm7,%xmm6 + rorl $11,%ecx + psrld $10,%xmm7 + andl %eax,%ebx + psrlq $17,%xmm6 + xorl %esi,%ecx + addl 72(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + pxor %xmm6,%xmm7 + addl %edx,%ebx + addl 4(%esp),%edx + psrlq $2,%xmm6 + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + pxor %xmm6,%xmm7 + movl 8(%esp),%esi + xorl %ecx,%edx + movl 12(%esp),%edi + pshufd $8,%xmm7,%xmm7 + xorl %edi,%esi + rorl $5,%edx + movdqa 32(%ebp),%xmm6 + andl %ecx,%esi + movl %ecx,4(%esp) + pslldq $8,%xmm7 + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + paddd %xmm7,%xmm2 + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + paddd %xmm2,%xmm6 + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 76(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + movdqa %xmm6,64(%esp) + movl %edx,%ecx + movdqa %xmm0,%xmm4 + rorl $14,%edx + movl 4(%esp),%esi + movdqa %xmm2,%xmm7 + xorl %ecx,%edx + movl 8(%esp),%edi +.byte 102,15,58,15,227,4 + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi +.byte 102,15,58,15,249,4 + movl %ecx,(%esp) + xorl %ecx,%edx + xorl %esi,%edi + movdqa %xmm4,%xmm5 + rorl $6,%edx + movl %eax,%ecx + movdqa %xmm4,%xmm6 + addl %edi,%edx + movl 20(%esp),%edi + psrld $3,%xmm4 + movl %eax,%esi + rorl $9,%ecx + paddd %xmm7,%xmm3 + movl %eax,16(%esp) + xorl %eax,%ecx + psrld $7,%xmm6 + xorl %edi,%eax + addl 12(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + pshufd $250,%xmm2,%xmm7 + xorl %esi,%ecx + addl 80(%esp),%edx + pslld $14,%xmm5 + xorl %edi,%ebx + rorl $2,%ecx + pxor %xmm6,%xmm4 + addl %edx,%ebx + addl 28(%esp),%edx + psrld $11,%xmm6 + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + pxor %xmm5,%xmm4 + movl (%esp),%esi + xorl %ecx,%edx + pslld $11,%xmm5 + movl 4(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + pxor %xmm6,%xmm4 + andl %ecx,%esi + movl %ecx,28(%esp) + movdqa %xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + pxor %xmm5,%xmm4 + movl %ebx,%ecx + addl %edi,%edx + psrld $10,%xmm7 + movl 16(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + paddd %xmm4,%xmm3 + movl %ebx,12(%esp) + xorl %ebx,%ecx + psrlq $17,%xmm6 + xorl %edi,%ebx + addl 8(%esp),%edx + rorl $11,%ecx + pxor %xmm6,%xmm7 + andl %ebx,%eax + xorl %esi,%ecx + psrlq $2,%xmm6 + addl 84(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + pxor %xmm6,%xmm7 + addl %edx,%eax + addl 24(%esp),%edx + pshufd $128,%xmm7,%xmm7 + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 28(%esp),%esi + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + psrldq $8,%xmm7 + movl %ecx,24(%esp) + xorl %ecx,%edx + xorl %esi,%edi + paddd %xmm7,%xmm3 + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,8(%esp) + pshufd $80,%xmm3,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + movdqa %xmm7,%xmm6 + rorl $11,%ecx + psrld $10,%xmm7 + andl %eax,%ebx + psrlq $17,%xmm6 + xorl %esi,%ecx + addl 88(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + pxor %xmm6,%xmm7 + addl %edx,%ebx + addl 20(%esp),%edx + psrlq $2,%xmm6 + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + pxor %xmm6,%xmm7 + movl 24(%esp),%esi + xorl %ecx,%edx + movl 28(%esp),%edi + pshufd $8,%xmm7,%xmm7 + xorl %edi,%esi + rorl $5,%edx + movdqa 48(%ebp),%xmm6 + andl %ecx,%esi + movl %ecx,20(%esp) + pslldq $8,%xmm7 + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + paddd %xmm7,%xmm3 + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + paddd %xmm3,%xmm6 + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 92(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movdqa %xmm6,80(%esp) + cmpl $66051,64(%ebp) + jne .L012ssse3_00_47 + movl %edx,%ecx + rorl $14,%edx + movl 20(%esp),%esi + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 32(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + movl 16(%esp),%esi + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + movl %ebx,28(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 36(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 12(%esp),%esi + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,24(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 40(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + movl 8(%esp),%esi + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 44(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 4(%esp),%esi + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,16(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 48(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + movl (%esp),%esi + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + movl %ebx,12(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 52(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 28(%esp),%esi + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,8(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 56(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + movl 24(%esp),%esi + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 60(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 20(%esp),%esi + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 64(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + movl 16(%esp),%esi + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + movl %ebx,28(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 68(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 12(%esp),%esi + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,24(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 72(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + movl 8(%esp),%esi + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 76(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 4(%esp),%esi + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,16(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 80(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + movl (%esp),%esi + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + movl %ebx,12(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 84(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 28(%esp),%esi + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,8(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 88(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + movl 24(%esp),%esi + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 92(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movl 96(%esp),%esi + xorl %edi,%ebx + movl 12(%esp),%ecx + addl (%esi),%eax + addl 4(%esi),%ebx + addl 8(%esi),%edi + addl 12(%esi),%ecx + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %edi,8(%esi) + movl %ecx,12(%esi) + movl %ebx,4(%esp) + xorl %edi,%ebx + movl %edi,8(%esp) + movl %ecx,12(%esp) + movl 20(%esp),%edi + movl 24(%esp),%ecx + addl 16(%esi),%edx + addl 20(%esi),%edi + addl 24(%esi),%ecx + movl %edx,16(%esi) + movl %edi,20(%esi) + movl %edi,20(%esp) + movl 28(%esp),%edi + movl %ecx,24(%esi) + addl 28(%esi),%edi + movl %ecx,24(%esp) + movl %edi,28(%esi) + movl %edi,28(%esp) + movl 100(%esp),%edi + movdqa 64(%ebp),%xmm7 + subl $192,%ebp + cmpl 104(%esp),%edi + jb .L011grand_ssse3 + movl 108(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret .size sha256_block_data_order,.-.L_sha256_block_data_order_begin #endif diff --git a/linux-x86/crypto/sha/sha512-586.S b/linux-x86/crypto/sha/sha512-586.S index 17f7945..a928400 100644 --- a/linux-x86/crypto/sha/sha512-586.S +++ b/linux-x86/crypto/sha/sha512-586.S @@ -27,6 +27,2269 @@ sha512_block_data_order: movl %edi,4(%esp) movl %eax,8(%esp) movl %ebx,12(%esp) + leal OPENSSL_ia32cap_P-.L001K512(%ebp),%edx + movl (%edx),%ecx + testl $67108864,%ecx + jz .L002loop_x86 + movl 4(%edx),%edx + movq (%esi),%mm0 + andl $16777216,%ecx + movq 8(%esi),%mm1 + andl $512,%edx + movq 16(%esi),%mm2 + orl %edx,%ecx + movq 24(%esi),%mm3 + movq 32(%esi),%mm4 + movq 40(%esi),%mm5 + movq 48(%esi),%mm6 + movq 56(%esi),%mm7 + cmpl $16777728,%ecx + je .L003SSSE3 + subl $80,%esp + jmp .L004loop_sse2 +.align 16 +.L004loop_sse2: + movq %mm1,8(%esp) + movq %mm2,16(%esp) + movq %mm3,24(%esp) + movq %mm5,40(%esp) + movq %mm6,48(%esp) + pxor %mm1,%mm2 + movq %mm7,56(%esp) + movq %mm0,%mm3 + movl (%edi),%eax + movl 4(%edi),%ebx + addl $8,%edi + movl $15,%edx + bswap %eax + bswap %ebx + jmp .L00500_14_sse2 +.align 16 +.L00500_14_sse2: + movd %eax,%mm1 + movl (%edi),%eax + movd %ebx,%mm7 + movl 4(%edi),%ebx + addl $8,%edi + bswap %eax + bswap %ebx + punpckldq %mm1,%mm7 + movq %mm4,%mm1 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,32(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + movq %mm3,%mm0 + movq %mm7,72(%esp) + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 56(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + paddq (%ebp),%mm7 + pxor %mm4,%mm3 + movq 24(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 8(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + subl $8,%esp + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 40(%esp),%mm5 + paddq %mm2,%mm3 + movq %mm0,%mm2 + addl $8,%ebp + paddq %mm6,%mm3 + movq 48(%esp),%mm6 + decl %edx + jnz .L00500_14_sse2 + movd %eax,%mm1 + movd %ebx,%mm7 + punpckldq %mm1,%mm7 + movq %mm4,%mm1 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,32(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + movq %mm3,%mm0 + movq %mm7,72(%esp) + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 56(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + paddq (%ebp),%mm7 + pxor %mm4,%mm3 + movq 24(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 8(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + subl $8,%esp + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 192(%esp),%mm7 + paddq %mm2,%mm3 + movq %mm0,%mm2 + addl $8,%ebp + paddq %mm6,%mm3 + pxor %mm0,%mm0 + movl $32,%edx + jmp .L00616_79_sse2 +.align 16 +.L00616_79_sse2: + movq 88(%esp),%mm5 + movq %mm7,%mm1 + psrlq $1,%mm7 + movq %mm5,%mm6 + psrlq $6,%mm5 + psllq $56,%mm1 + paddq %mm3,%mm0 + movq %mm7,%mm3 + psrlq $6,%mm7 + pxor %mm1,%mm3 + psllq $7,%mm1 + pxor %mm7,%mm3 + psrlq $1,%mm7 + pxor %mm1,%mm3 + movq %mm5,%mm1 + psrlq $13,%mm5 + pxor %mm3,%mm7 + psllq $3,%mm6 + pxor %mm5,%mm1 + paddq 200(%esp),%mm7 + pxor %mm6,%mm1 + psrlq $42,%mm5 + paddq 128(%esp),%mm7 + pxor %mm5,%mm1 + psllq $42,%mm6 + movq 40(%esp),%mm5 + pxor %mm6,%mm1 + movq 48(%esp),%mm6 + paddq %mm1,%mm7 + movq %mm4,%mm1 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,32(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + movq %mm7,72(%esp) + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 56(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + paddq (%ebp),%mm7 + pxor %mm4,%mm3 + movq 24(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 8(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + subl $8,%esp + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 192(%esp),%mm7 + paddq %mm6,%mm2 + addl $8,%ebp + movq 88(%esp),%mm5 + movq %mm7,%mm1 + psrlq $1,%mm7 + movq %mm5,%mm6 + psrlq $6,%mm5 + psllq $56,%mm1 + paddq %mm3,%mm2 + movq %mm7,%mm3 + psrlq $6,%mm7 + pxor %mm1,%mm3 + psllq $7,%mm1 + pxor %mm7,%mm3 + psrlq $1,%mm7 + pxor %mm1,%mm3 + movq %mm5,%mm1 + psrlq $13,%mm5 + pxor %mm3,%mm7 + psllq $3,%mm6 + pxor %mm5,%mm1 + paddq 200(%esp),%mm7 + pxor %mm6,%mm1 + psrlq $42,%mm5 + paddq 128(%esp),%mm7 + pxor %mm5,%mm1 + psllq $42,%mm6 + movq 40(%esp),%mm5 + pxor %mm6,%mm1 + movq 48(%esp),%mm6 + paddq %mm1,%mm7 + movq %mm4,%mm1 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,32(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + movq %mm7,72(%esp) + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 56(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + paddq (%ebp),%mm7 + pxor %mm4,%mm3 + movq 24(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 8(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + subl $8,%esp + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 192(%esp),%mm7 + paddq %mm6,%mm0 + addl $8,%ebp + decl %edx + jnz .L00616_79_sse2 + paddq %mm3,%mm0 + movq 8(%esp),%mm1 + movq 24(%esp),%mm3 + movq 40(%esp),%mm5 + movq 48(%esp),%mm6 + movq 56(%esp),%mm7 + pxor %mm1,%mm2 + paddq (%esi),%mm0 + paddq 8(%esi),%mm1 + paddq 16(%esi),%mm2 + paddq 24(%esi),%mm3 + paddq 32(%esi),%mm4 + paddq 40(%esi),%mm5 + paddq 48(%esi),%mm6 + paddq 56(%esi),%mm7 + movl $640,%eax + movq %mm0,(%esi) + movq %mm1,8(%esi) + movq %mm2,16(%esi) + movq %mm3,24(%esi) + movq %mm4,32(%esi) + movq %mm5,40(%esi) + movq %mm6,48(%esi) + movq %mm7,56(%esi) + leal (%esp,%eax,1),%esp + subl %eax,%ebp + cmpl 88(%esp),%edi + jb .L004loop_sse2 + movl 92(%esp),%esp + emms + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.align 32 +.L003SSSE3: + leal -64(%esp),%edx + subl $256,%esp + movdqa 640(%ebp),%xmm1 + movdqu (%edi),%xmm0 +.byte 102,15,56,0,193 + movdqa (%ebp),%xmm3 + movdqa %xmm1,%xmm2 + movdqu 16(%edi),%xmm1 + paddq %xmm0,%xmm3 +.byte 102,15,56,0,202 + movdqa %xmm3,-128(%edx) + movdqa 16(%ebp),%xmm4 + movdqa %xmm2,%xmm3 + movdqu 32(%edi),%xmm2 + paddq %xmm1,%xmm4 +.byte 102,15,56,0,211 + movdqa %xmm4,-112(%edx) + movdqa 32(%ebp),%xmm5 + movdqa %xmm3,%xmm4 + movdqu 48(%edi),%xmm3 + paddq %xmm2,%xmm5 +.byte 102,15,56,0,220 + movdqa %xmm5,-96(%edx) + movdqa 48(%ebp),%xmm6 + movdqa %xmm4,%xmm5 + movdqu 64(%edi),%xmm4 + paddq %xmm3,%xmm6 +.byte 102,15,56,0,229 + movdqa %xmm6,-80(%edx) + movdqa 64(%ebp),%xmm7 + movdqa %xmm5,%xmm6 + movdqu 80(%edi),%xmm5 + paddq %xmm4,%xmm7 +.byte 102,15,56,0,238 + movdqa %xmm7,-64(%edx) + movdqa %xmm0,(%edx) + movdqa 80(%ebp),%xmm0 + movdqa %xmm6,%xmm7 + movdqu 96(%edi),%xmm6 + paddq %xmm5,%xmm0 +.byte 102,15,56,0,247 + movdqa %xmm0,-48(%edx) + movdqa %xmm1,16(%edx) + movdqa 96(%ebp),%xmm1 + movdqa %xmm7,%xmm0 + movdqu 112(%edi),%xmm7 + paddq %xmm6,%xmm1 +.byte 102,15,56,0,248 + movdqa %xmm1,-32(%edx) + movdqa %xmm2,32(%edx) + movdqa 112(%ebp),%xmm2 + movdqa (%edx),%xmm0 + paddq %xmm7,%xmm2 + movdqa %xmm2,-16(%edx) + nop +.align 32 +.L007loop_ssse3: + movdqa 16(%edx),%xmm2 + movdqa %xmm3,48(%edx) + leal 128(%ebp),%ebp + movq %mm1,8(%esp) + movl %edi,%ebx + movq %mm2,16(%esp) + leal 128(%edi),%edi + movq %mm3,24(%esp) + cmpl %eax,%edi + movq %mm5,40(%esp) + cmovbl %edi,%ebx + movq %mm6,48(%esp) + movl $4,%ecx + pxor %mm1,%mm2 + movq %mm7,56(%esp) + pxor %mm3,%mm3 + jmp .L00800_47_ssse3 +.align 32 +.L00800_47_ssse3: + movdqa %xmm5,%xmm3 + movdqa %xmm2,%xmm1 +.byte 102,15,58,15,208,8 + movdqa %xmm4,(%edx) +.byte 102,15,58,15,220,8 + movdqa %xmm2,%xmm4 + psrlq $7,%xmm2 + paddq %xmm3,%xmm0 + movdqa %xmm4,%xmm3 + psrlq $1,%xmm4 + psllq $56,%xmm3 + pxor %xmm4,%xmm2 + psrlq $7,%xmm4 + pxor %xmm3,%xmm2 + psllq $7,%xmm3 + pxor %xmm4,%xmm2 + movdqa %xmm7,%xmm4 + pxor %xmm3,%xmm2 + movdqa %xmm7,%xmm3 + psrlq $6,%xmm4 + paddq %xmm2,%xmm0 + movdqa %xmm7,%xmm2 + psrlq $19,%xmm3 + psllq $3,%xmm2 + pxor %xmm3,%xmm4 + psrlq $42,%xmm3 + pxor %xmm2,%xmm4 + psllq $42,%xmm2 + pxor %xmm3,%xmm4 + movdqa 32(%edx),%xmm3 + pxor %xmm2,%xmm4 + movdqa (%ebp),%xmm2 + movq %mm4,%mm1 + paddq %xmm4,%xmm0 + movq -128(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,32(%esp) + paddq %xmm0,%xmm2 + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 56(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 24(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 8(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 32(%esp),%mm5 + paddq %mm6,%mm2 + movq 40(%esp),%mm6 + movq %mm4,%mm1 + movq -120(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,24(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,56(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 48(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 16(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq (%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 24(%esp),%mm5 + paddq %mm6,%mm0 + movq 32(%esp),%mm6 + movdqa %xmm2,-128(%edx) + movdqa %xmm6,%xmm4 + movdqa %xmm3,%xmm2 +.byte 102,15,58,15,217,8 + movdqa %xmm5,16(%edx) +.byte 102,15,58,15,229,8 + movdqa %xmm3,%xmm5 + psrlq $7,%xmm3 + paddq %xmm4,%xmm1 + movdqa %xmm5,%xmm4 + psrlq $1,%xmm5 + psllq $56,%xmm4 + pxor %xmm5,%xmm3 + psrlq $7,%xmm5 + pxor %xmm4,%xmm3 + psllq $7,%xmm4 + pxor %xmm5,%xmm3 + movdqa %xmm0,%xmm5 + pxor %xmm4,%xmm3 + movdqa %xmm0,%xmm4 + psrlq $6,%xmm5 + paddq %xmm3,%xmm1 + movdqa %xmm0,%xmm3 + psrlq $19,%xmm4 + psllq $3,%xmm3 + pxor %xmm4,%xmm5 + psrlq $42,%xmm4 + pxor %xmm3,%xmm5 + psllq $42,%xmm3 + pxor %xmm4,%xmm5 + movdqa 48(%edx),%xmm4 + pxor %xmm3,%xmm5 + movdqa 16(%ebp),%xmm3 + movq %mm4,%mm1 + paddq %xmm5,%xmm1 + movq -112(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,16(%esp) + paddq %xmm1,%xmm3 + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,48(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 40(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 8(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 56(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 16(%esp),%mm5 + paddq %mm6,%mm2 + movq 24(%esp),%mm6 + movq %mm4,%mm1 + movq -104(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,8(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,40(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 32(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq (%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 48(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 8(%esp),%mm5 + paddq %mm6,%mm0 + movq 16(%esp),%mm6 + movdqa %xmm3,-112(%edx) + movdqa %xmm7,%xmm5 + movdqa %xmm4,%xmm3 +.byte 102,15,58,15,226,8 + movdqa %xmm6,32(%edx) +.byte 102,15,58,15,238,8 + movdqa %xmm4,%xmm6 + psrlq $7,%xmm4 + paddq %xmm5,%xmm2 + movdqa %xmm6,%xmm5 + psrlq $1,%xmm6 + psllq $56,%xmm5 + pxor %xmm6,%xmm4 + psrlq $7,%xmm6 + pxor %xmm5,%xmm4 + psllq $7,%xmm5 + pxor %xmm6,%xmm4 + movdqa %xmm1,%xmm6 + pxor %xmm5,%xmm4 + movdqa %xmm1,%xmm5 + psrlq $6,%xmm6 + paddq %xmm4,%xmm2 + movdqa %xmm1,%xmm4 + psrlq $19,%xmm5 + psllq $3,%xmm4 + pxor %xmm5,%xmm6 + psrlq $42,%xmm5 + pxor %xmm4,%xmm6 + psllq $42,%xmm4 + pxor %xmm5,%xmm6 + movdqa (%edx),%xmm5 + pxor %xmm4,%xmm6 + movdqa 32(%ebp),%xmm4 + movq %mm4,%mm1 + paddq %xmm6,%xmm2 + movq -96(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,(%esp) + paddq %xmm2,%xmm4 + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,32(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 24(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 56(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 40(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq (%esp),%mm5 + paddq %mm6,%mm2 + movq 8(%esp),%mm6 + movq %mm4,%mm1 + movq -88(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,56(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,24(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 16(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 48(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 32(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 56(%esp),%mm5 + paddq %mm6,%mm0 + movq (%esp),%mm6 + movdqa %xmm4,-96(%edx) + movdqa %xmm0,%xmm6 + movdqa %xmm5,%xmm4 +.byte 102,15,58,15,235,8 + movdqa %xmm7,48(%edx) +.byte 102,15,58,15,247,8 + movdqa %xmm5,%xmm7 + psrlq $7,%xmm5 + paddq %xmm6,%xmm3 + movdqa %xmm7,%xmm6 + psrlq $1,%xmm7 + psllq $56,%xmm6 + pxor %xmm7,%xmm5 + psrlq $7,%xmm7 + pxor %xmm6,%xmm5 + psllq $7,%xmm6 + pxor %xmm7,%xmm5 + movdqa %xmm2,%xmm7 + pxor %xmm6,%xmm5 + movdqa %xmm2,%xmm6 + psrlq $6,%xmm7 + paddq %xmm5,%xmm3 + movdqa %xmm2,%xmm5 + psrlq $19,%xmm6 + psllq $3,%xmm5 + pxor %xmm6,%xmm7 + psrlq $42,%xmm6 + pxor %xmm5,%xmm7 + psllq $42,%xmm5 + pxor %xmm6,%xmm7 + movdqa 16(%edx),%xmm6 + pxor %xmm5,%xmm7 + movdqa 48(%ebp),%xmm5 + movq %mm4,%mm1 + paddq %xmm7,%xmm3 + movq -80(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,48(%esp) + paddq %xmm3,%xmm5 + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,16(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 8(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 40(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 24(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 48(%esp),%mm5 + paddq %mm6,%mm2 + movq 56(%esp),%mm6 + movq %mm4,%mm1 + movq -72(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,40(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,8(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq (%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 32(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 16(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 40(%esp),%mm5 + paddq %mm6,%mm0 + movq 48(%esp),%mm6 + movdqa %xmm5,-80(%edx) + movdqa %xmm1,%xmm7 + movdqa %xmm6,%xmm5 +.byte 102,15,58,15,244,8 + movdqa %xmm0,(%edx) +.byte 102,15,58,15,248,8 + movdqa %xmm6,%xmm0 + psrlq $7,%xmm6 + paddq %xmm7,%xmm4 + movdqa %xmm0,%xmm7 + psrlq $1,%xmm0 + psllq $56,%xmm7 + pxor %xmm0,%xmm6 + psrlq $7,%xmm0 + pxor %xmm7,%xmm6 + psllq $7,%xmm7 + pxor %xmm0,%xmm6 + movdqa %xmm3,%xmm0 + pxor %xmm7,%xmm6 + movdqa %xmm3,%xmm7 + psrlq $6,%xmm0 + paddq %xmm6,%xmm4 + movdqa %xmm3,%xmm6 + psrlq $19,%xmm7 + psllq $3,%xmm6 + pxor %xmm7,%xmm0 + psrlq $42,%xmm7 + pxor %xmm6,%xmm0 + psllq $42,%xmm6 + pxor %xmm7,%xmm0 + movdqa 32(%edx),%xmm7 + pxor %xmm6,%xmm0 + movdqa 64(%ebp),%xmm6 + movq %mm4,%mm1 + paddq %xmm0,%xmm4 + movq -64(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,32(%esp) + paddq %xmm4,%xmm6 + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 56(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 24(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 8(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 32(%esp),%mm5 + paddq %mm6,%mm2 + movq 40(%esp),%mm6 + movq %mm4,%mm1 + movq -56(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,24(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,56(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 48(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 16(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq (%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 24(%esp),%mm5 + paddq %mm6,%mm0 + movq 32(%esp),%mm6 + movdqa %xmm6,-64(%edx) + movdqa %xmm2,%xmm0 + movdqa %xmm7,%xmm6 +.byte 102,15,58,15,253,8 + movdqa %xmm1,16(%edx) +.byte 102,15,58,15,193,8 + movdqa %xmm7,%xmm1 + psrlq $7,%xmm7 + paddq %xmm0,%xmm5 + movdqa %xmm1,%xmm0 + psrlq $1,%xmm1 + psllq $56,%xmm0 + pxor %xmm1,%xmm7 + psrlq $7,%xmm1 + pxor %xmm0,%xmm7 + psllq $7,%xmm0 + pxor %xmm1,%xmm7 + movdqa %xmm4,%xmm1 + pxor %xmm0,%xmm7 + movdqa %xmm4,%xmm0 + psrlq $6,%xmm1 + paddq %xmm7,%xmm5 + movdqa %xmm4,%xmm7 + psrlq $19,%xmm0 + psllq $3,%xmm7 + pxor %xmm0,%xmm1 + psrlq $42,%xmm0 + pxor %xmm7,%xmm1 + psllq $42,%xmm7 + pxor %xmm0,%xmm1 + movdqa 48(%edx),%xmm0 + pxor %xmm7,%xmm1 + movdqa 80(%ebp),%xmm7 + movq %mm4,%mm1 + paddq %xmm1,%xmm5 + movq -48(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,16(%esp) + paddq %xmm5,%xmm7 + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,48(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 40(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 8(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 56(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 16(%esp),%mm5 + paddq %mm6,%mm2 + movq 24(%esp),%mm6 + movq %mm4,%mm1 + movq -40(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,8(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,40(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 32(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq (%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 48(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 8(%esp),%mm5 + paddq %mm6,%mm0 + movq 16(%esp),%mm6 + movdqa %xmm7,-48(%edx) + movdqa %xmm3,%xmm1 + movdqa %xmm0,%xmm7 +.byte 102,15,58,15,198,8 + movdqa %xmm2,32(%edx) +.byte 102,15,58,15,202,8 + movdqa %xmm0,%xmm2 + psrlq $7,%xmm0 + paddq %xmm1,%xmm6 + movdqa %xmm2,%xmm1 + psrlq $1,%xmm2 + psllq $56,%xmm1 + pxor %xmm2,%xmm0 + psrlq $7,%xmm2 + pxor %xmm1,%xmm0 + psllq $7,%xmm1 + pxor %xmm2,%xmm0 + movdqa %xmm5,%xmm2 + pxor %xmm1,%xmm0 + movdqa %xmm5,%xmm1 + psrlq $6,%xmm2 + paddq %xmm0,%xmm6 + movdqa %xmm5,%xmm0 + psrlq $19,%xmm1 + psllq $3,%xmm0 + pxor %xmm1,%xmm2 + psrlq $42,%xmm1 + pxor %xmm0,%xmm2 + psllq $42,%xmm0 + pxor %xmm1,%xmm2 + movdqa (%edx),%xmm1 + pxor %xmm0,%xmm2 + movdqa 96(%ebp),%xmm0 + movq %mm4,%mm1 + paddq %xmm2,%xmm6 + movq -32(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,(%esp) + paddq %xmm6,%xmm0 + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,32(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 24(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 56(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 40(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq (%esp),%mm5 + paddq %mm6,%mm2 + movq 8(%esp),%mm6 + movq %mm4,%mm1 + movq -24(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,56(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,24(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 16(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 48(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 32(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 56(%esp),%mm5 + paddq %mm6,%mm0 + movq (%esp),%mm6 + movdqa %xmm0,-32(%edx) + movdqa %xmm4,%xmm2 + movdqa %xmm1,%xmm0 +.byte 102,15,58,15,207,8 + movdqa %xmm3,48(%edx) +.byte 102,15,58,15,211,8 + movdqa %xmm1,%xmm3 + psrlq $7,%xmm1 + paddq %xmm2,%xmm7 + movdqa %xmm3,%xmm2 + psrlq $1,%xmm3 + psllq $56,%xmm2 + pxor %xmm3,%xmm1 + psrlq $7,%xmm3 + pxor %xmm2,%xmm1 + psllq $7,%xmm2 + pxor %xmm3,%xmm1 + movdqa %xmm6,%xmm3 + pxor %xmm2,%xmm1 + movdqa %xmm6,%xmm2 + psrlq $6,%xmm3 + paddq %xmm1,%xmm7 + movdqa %xmm6,%xmm1 + psrlq $19,%xmm2 + psllq $3,%xmm1 + pxor %xmm2,%xmm3 + psrlq $42,%xmm2 + pxor %xmm1,%xmm3 + psllq $42,%xmm1 + pxor %xmm2,%xmm3 + movdqa 16(%edx),%xmm2 + pxor %xmm1,%xmm3 + movdqa 112(%ebp),%xmm1 + movq %mm4,%mm1 + paddq %xmm3,%xmm7 + movq -16(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,48(%esp) + paddq %xmm7,%xmm1 + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,16(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 8(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 40(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 24(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 48(%esp),%mm5 + paddq %mm6,%mm2 + movq 56(%esp),%mm6 + movq %mm4,%mm1 + movq -8(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,40(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,8(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq (%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 32(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 16(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 40(%esp),%mm5 + paddq %mm6,%mm0 + movq 48(%esp),%mm6 + movdqa %xmm1,-16(%edx) + leal 128(%ebp),%ebp + decl %ecx + jnz .L00800_47_ssse3 + movdqa (%ebp),%xmm1 + leal -640(%ebp),%ebp + movdqu (%ebx),%xmm0 +.byte 102,15,56,0,193 + movdqa (%ebp),%xmm3 + movdqa %xmm1,%xmm2 + movdqu 16(%ebx),%xmm1 + paddq %xmm0,%xmm3 +.byte 102,15,56,0,202 + movq %mm4,%mm1 + movq -128(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,32(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 56(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 24(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 8(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 32(%esp),%mm5 + paddq %mm6,%mm2 + movq 40(%esp),%mm6 + movq %mm4,%mm1 + movq -120(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,24(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,56(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 48(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 16(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq (%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 24(%esp),%mm5 + paddq %mm6,%mm0 + movq 32(%esp),%mm6 + movdqa %xmm3,-128(%edx) + movdqa 16(%ebp),%xmm4 + movdqa %xmm2,%xmm3 + movdqu 32(%ebx),%xmm2 + paddq %xmm1,%xmm4 +.byte 102,15,56,0,211 + movq %mm4,%mm1 + movq -112(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,16(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,48(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 40(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 8(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 56(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 16(%esp),%mm5 + paddq %mm6,%mm2 + movq 24(%esp),%mm6 + movq %mm4,%mm1 + movq -104(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,8(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,40(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 32(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq (%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 48(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 8(%esp),%mm5 + paddq %mm6,%mm0 + movq 16(%esp),%mm6 + movdqa %xmm4,-112(%edx) + movdqa 32(%ebp),%xmm5 + movdqa %xmm3,%xmm4 + movdqu 48(%ebx),%xmm3 + paddq %xmm2,%xmm5 +.byte 102,15,56,0,220 + movq %mm4,%mm1 + movq -96(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,32(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 24(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 56(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 40(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq (%esp),%mm5 + paddq %mm6,%mm2 + movq 8(%esp),%mm6 + movq %mm4,%mm1 + movq -88(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,56(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,24(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 16(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 48(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 32(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 56(%esp),%mm5 + paddq %mm6,%mm0 + movq (%esp),%mm6 + movdqa %xmm5,-96(%edx) + movdqa 48(%ebp),%xmm6 + movdqa %xmm4,%xmm5 + movdqu 64(%ebx),%xmm4 + paddq %xmm3,%xmm6 +.byte 102,15,56,0,229 + movq %mm4,%mm1 + movq -80(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,48(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,16(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 8(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 40(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 24(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 48(%esp),%mm5 + paddq %mm6,%mm2 + movq 56(%esp),%mm6 + movq %mm4,%mm1 + movq -72(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,40(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,8(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq (%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 32(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 16(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 40(%esp),%mm5 + paddq %mm6,%mm0 + movq 48(%esp),%mm6 + movdqa %xmm6,-80(%edx) + movdqa 64(%ebp),%xmm7 + movdqa %xmm5,%xmm6 + movdqu 80(%ebx),%xmm5 + paddq %xmm4,%xmm7 +.byte 102,15,56,0,238 + movq %mm4,%mm1 + movq -64(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,32(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 56(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 24(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 8(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 32(%esp),%mm5 + paddq %mm6,%mm2 + movq 40(%esp),%mm6 + movq %mm4,%mm1 + movq -56(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,24(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,56(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 48(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 16(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq (%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 24(%esp),%mm5 + paddq %mm6,%mm0 + movq 32(%esp),%mm6 + movdqa %xmm7,-64(%edx) + movdqa %xmm0,(%edx) + movdqa 80(%ebp),%xmm0 + movdqa %xmm6,%xmm7 + movdqu 96(%ebx),%xmm6 + paddq %xmm5,%xmm0 +.byte 102,15,56,0,247 + movq %mm4,%mm1 + movq -48(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,16(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,48(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 40(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 8(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 56(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 16(%esp),%mm5 + paddq %mm6,%mm2 + movq 24(%esp),%mm6 + movq %mm4,%mm1 + movq -40(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,8(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,40(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 32(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq (%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 48(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 8(%esp),%mm5 + paddq %mm6,%mm0 + movq 16(%esp),%mm6 + movdqa %xmm0,-48(%edx) + movdqa %xmm1,16(%edx) + movdqa 96(%ebp),%xmm1 + movdqa %xmm7,%xmm0 + movdqu 112(%ebx),%xmm7 + paddq %xmm6,%xmm1 +.byte 102,15,56,0,248 + movq %mm4,%mm1 + movq -32(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,32(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 24(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 56(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 40(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq (%esp),%mm5 + paddq %mm6,%mm2 + movq 8(%esp),%mm6 + movq %mm4,%mm1 + movq -24(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,56(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,24(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 16(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 48(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 32(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 56(%esp),%mm5 + paddq %mm6,%mm0 + movq (%esp),%mm6 + movdqa %xmm1,-32(%edx) + movdqa %xmm2,32(%edx) + movdqa 112(%ebp),%xmm2 + movdqa (%edx),%xmm0 + paddq %xmm7,%xmm2 + movq %mm4,%mm1 + movq -16(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,48(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,16(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 8(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 40(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 24(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 48(%esp),%mm5 + paddq %mm6,%mm2 + movq 56(%esp),%mm6 + movq %mm4,%mm1 + movq -8(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,40(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,8(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq (%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 32(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 16(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 40(%esp),%mm5 + paddq %mm6,%mm0 + movq 48(%esp),%mm6 + movdqa %xmm2,-16(%edx) + movq 8(%esp),%mm1 + paddq %mm3,%mm0 + movq 24(%esp),%mm3 + movq 56(%esp),%mm7 + pxor %mm1,%mm2 + paddq (%esi),%mm0 + paddq 8(%esi),%mm1 + paddq 16(%esi),%mm2 + paddq 24(%esi),%mm3 + paddq 32(%esi),%mm4 + paddq 40(%esi),%mm5 + paddq 48(%esi),%mm6 + paddq 56(%esi),%mm7 + movq %mm0,(%esi) + movq %mm1,8(%esi) + movq %mm2,16(%esi) + movq %mm3,24(%esi) + movq %mm4,32(%esi) + movq %mm5,40(%esi) + movq %mm6,48(%esi) + movq %mm7,56(%esi) + cmpl %eax,%edi + jb .L007loop_ssse3 + movl 76(%edx),%esp + emms + popl %edi + popl %esi + popl %ebx + popl %ebp + ret .align 16 .L002loop_x86: movl (%edi),%eax @@ -132,7 +2395,7 @@ sha512_block_data_order: movl $16,%ecx .long 2784229001 .align 16 -.L00300_15_x86: +.L00900_15_x86: movl 40(%esp),%ecx movl 44(%esp),%edx movl %ecx,%esi @@ -239,9 +2502,9 @@ sha512_block_data_order: subl $8,%esp leal 8(%ebp),%ebp cmpb $148,%dl - jne .L00300_15_x86 + jne .L00900_15_x86 .align 16 -.L00416_79_x86: +.L01016_79_x86: movl 312(%esp),%ecx movl 316(%esp),%edx movl %ecx,%esi @@ -414,7 +2677,7 @@ sha512_block_data_order: subl $8,%esp leal 8(%ebp),%ebp cmpb $23,%dl - jne .L00416_79_x86 + jne .L01016_79_x86 movl 840(%esp),%esi movl 844(%esp),%edi movl (%esi),%eax |