author    Adam Langley <agl@google.com>  2015-05-11 17:20:37 -0700
committer Kenny Root <kroot@google.com>  2015-05-12 23:06:14 +0000
commit    e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5 (patch)
tree      6e43e34595ecf887c26c32b86d8ab097fe8cac64 /mac-x86
parent    b3106a0cc1493bbe0505c0ec0ce3da4ca90a29ae (diff)
external/boringssl: bump revision.
This change bumps the BoringSSL revision to the current tip-of-tree.

Change-Id: I91d5bf467e16e8d86cb19a4de873985f524e5faa
Diffstat (limited to 'mac-x86')
-rw-r--r--  mac-x86/crypto/aes/aesni-x86.S     793
-rw-r--r--  mac-x86/crypto/bn/bn-586.S         441
-rw-r--r--  mac-x86/crypto/bn/x86-mont.S       176
-rw-r--r--  mac-x86/crypto/cpu-x86-asm.S        28
-rw-r--r--  mac-x86/crypto/sha/sha1-586.S     1422
-rw-r--r--  mac-x86/crypto/sha/sha256-586.S   1647
-rw-r--r--  mac-x86/crypto/sha/sha512-586.S   2275
7 files changed, 6117 insertions, 665 deletions
diff --git a/mac-x86/crypto/aes/aesni-x86.S b/mac-x86/crypto/aes/aesni-x86.S
index 9000478..07719ba 100644
--- a/mac-x86/crypto/aes/aesni-x86.S
+++ b/mac-x86/crypto/aes/aesni-x86.S
@@ -22,7 +22,10 @@ L000enc1_loop_1:
leal 16(%edx),%edx
jnz L000enc1_loop_1
.byte 102,15,56,221,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%eax)
+ pxor %xmm2,%xmm2
ret
.globl _aesni_decrypt
.private_extern _aesni_decrypt
@@ -45,7 +48,10 @@ L001dec1_loop_2:
leal 16(%edx),%edx
jnz L001dec1_loop_2
.byte 102,15,56,223,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%eax)
+ pxor %xmm2,%xmm2
ret
.private_extern __aesni_encrypt2
.align 4
@@ -252,17 +258,15 @@ __aesni_encrypt6:
negl %ecx
.byte 102,15,56,220,225
pxor %xmm0,%xmm7
+ movups (%edx,%ecx,1),%xmm0
addl $16,%ecx
-.byte 102,15,56,220,233
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
- movups -16(%edx,%ecx,1),%xmm0
- jmp L_aesni_encrypt6_enter
+ jmp L008_aesni_encrypt6_inner
.align 4,0x90
-L008enc6_loop:
+L009enc6_loop:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
+L008_aesni_encrypt6_inner:
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
@@ -276,7 +280,7 @@ L_aesni_encrypt6_enter:
.byte 102,15,56,220,240
.byte 102,15,56,220,248
movups -16(%edx,%ecx,1),%xmm0
- jnz L008enc6_loop
+ jnz L009enc6_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
@@ -307,17 +311,15 @@ __aesni_decrypt6:
negl %ecx
.byte 102,15,56,222,225
pxor %xmm0,%xmm7
+ movups (%edx,%ecx,1),%xmm0
addl $16,%ecx
-.byte 102,15,56,222,233
-.byte 102,15,56,222,241
-.byte 102,15,56,222,249
- movups -16(%edx,%ecx,1),%xmm0
- jmp L_aesni_decrypt6_enter
+ jmp L010_aesni_decrypt6_inner
.align 4,0x90
-L009dec6_loop:
+L011dec6_loop:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
+L010_aesni_decrypt6_inner:
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
@@ -331,7 +333,7 @@ L_aesni_decrypt6_enter:
.byte 102,15,56,222,240
.byte 102,15,56,222,248
movups -16(%edx,%ecx,1),%xmm0
- jnz L009dec6_loop
+ jnz L011dec6_loop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
@@ -360,14 +362,14 @@ L_aesni_ecb_encrypt_begin:
movl 32(%esp),%edx
movl 36(%esp),%ebx
andl $-16,%eax
- jz L010ecb_ret
+ jz L012ecb_ret
movl 240(%edx),%ecx
testl %ebx,%ebx
- jz L011ecb_decrypt
+ jz L013ecb_decrypt
movl %edx,%ebp
movl %ecx,%ebx
cmpl $96,%eax
- jb L012ecb_enc_tail
+ jb L014ecb_enc_tail
movdqu (%esi),%xmm2
movdqu 16(%esi),%xmm3
movdqu 32(%esi),%xmm4
@@ -376,9 +378,9 @@ L_aesni_ecb_encrypt_begin:
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
subl $96,%eax
- jmp L013ecb_enc_loop6_enter
+ jmp L015ecb_enc_loop6_enter
.align 4,0x90
-L014ecb_enc_loop6:
+L016ecb_enc_loop6:
movups %xmm2,(%edi)
movdqu (%esi),%xmm2
movups %xmm3,16(%edi)
@@ -393,12 +395,12 @@ L014ecb_enc_loop6:
leal 96(%edi),%edi
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
-L013ecb_enc_loop6_enter:
+L015ecb_enc_loop6_enter:
call __aesni_encrypt6
movl %ebp,%edx
movl %ebx,%ecx
subl $96,%eax
- jnc L014ecb_enc_loop6
+ jnc L016ecb_enc_loop6
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
@@ -407,18 +409,18 @@ L013ecb_enc_loop6_enter:
movups %xmm7,80(%edi)
leal 96(%edi),%edi
addl $96,%eax
- jz L010ecb_ret
-L012ecb_enc_tail:
+ jz L012ecb_ret
+L014ecb_enc_tail:
movups (%esi),%xmm2
cmpl $32,%eax
- jb L015ecb_enc_one
+ jb L017ecb_enc_one
movups 16(%esi),%xmm3
- je L016ecb_enc_two
+ je L018ecb_enc_two
movups 32(%esi),%xmm4
cmpl $64,%eax
- jb L017ecb_enc_three
+ jb L019ecb_enc_three
movups 48(%esi),%xmm5
- je L018ecb_enc_four
+ je L020ecb_enc_four
movups 64(%esi),%xmm6
xorps %xmm7,%xmm7
call __aesni_encrypt6
@@ -427,49 +429,49 @@ L012ecb_enc_tail:
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
- jmp L010ecb_ret
+ jmp L012ecb_ret
.align 4,0x90
-L015ecb_enc_one:
+L017ecb_enc_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L019enc1_loop_3:
+L021enc1_loop_3:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L019enc1_loop_3
+ jnz L021enc1_loop_3
.byte 102,15,56,221,209
movups %xmm2,(%edi)
- jmp L010ecb_ret
+ jmp L012ecb_ret
.align 4,0x90
-L016ecb_enc_two:
+L018ecb_enc_two:
call __aesni_encrypt2
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
- jmp L010ecb_ret
+ jmp L012ecb_ret
.align 4,0x90
-L017ecb_enc_three:
+L019ecb_enc_three:
call __aesni_encrypt3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
- jmp L010ecb_ret
+ jmp L012ecb_ret
.align 4,0x90
-L018ecb_enc_four:
+L020ecb_enc_four:
call __aesni_encrypt4
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
- jmp L010ecb_ret
+ jmp L012ecb_ret
.align 4,0x90
-L011ecb_decrypt:
+L013ecb_decrypt:
movl %edx,%ebp
movl %ecx,%ebx
cmpl $96,%eax
- jb L020ecb_dec_tail
+ jb L022ecb_dec_tail
movdqu (%esi),%xmm2
movdqu 16(%esi),%xmm3
movdqu 32(%esi),%xmm4
@@ -478,9 +480,9 @@ L011ecb_decrypt:
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
subl $96,%eax
- jmp L021ecb_dec_loop6_enter
+ jmp L023ecb_dec_loop6_enter
.align 4,0x90
-L022ecb_dec_loop6:
+L024ecb_dec_loop6:
movups %xmm2,(%edi)
movdqu (%esi),%xmm2
movups %xmm3,16(%edi)
@@ -495,12 +497,12 @@ L022ecb_dec_loop6:
leal 96(%edi),%edi
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
-L021ecb_dec_loop6_enter:
+L023ecb_dec_loop6_enter:
call __aesni_decrypt6
movl %ebp,%edx
movl %ebx,%ecx
subl $96,%eax
- jnc L022ecb_dec_loop6
+ jnc L024ecb_dec_loop6
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
@@ -509,18 +511,18 @@ L021ecb_dec_loop6_enter:
movups %xmm7,80(%edi)
leal 96(%edi),%edi
addl $96,%eax
- jz L010ecb_ret
-L020ecb_dec_tail:
+ jz L012ecb_ret
+L022ecb_dec_tail:
movups (%esi),%xmm2
cmpl $32,%eax
- jb L023ecb_dec_one
+ jb L025ecb_dec_one
movups 16(%esi),%xmm3
- je L024ecb_dec_two
+ je L026ecb_dec_two
movups 32(%esi),%xmm4
cmpl $64,%eax
- jb L025ecb_dec_three
+ jb L027ecb_dec_three
movups 48(%esi),%xmm5
- je L026ecb_dec_four
+ je L028ecb_dec_four
movups 64(%esi),%xmm6
xorps %xmm7,%xmm7
call __aesni_decrypt6
@@ -529,43 +531,51 @@ L020ecb_dec_tail:
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
- jmp L010ecb_ret
+ jmp L012ecb_ret
.align 4,0x90
-L023ecb_dec_one:
+L025ecb_dec_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L027dec1_loop_4:
+L029dec1_loop_4:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L027dec1_loop_4
+ jnz L029dec1_loop_4
.byte 102,15,56,223,209
movups %xmm2,(%edi)
- jmp L010ecb_ret
+ jmp L012ecb_ret
.align 4,0x90
-L024ecb_dec_two:
+L026ecb_dec_two:
call __aesni_decrypt2
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
- jmp L010ecb_ret
+ jmp L012ecb_ret
.align 4,0x90
-L025ecb_dec_three:
+L027ecb_dec_three:
call __aesni_decrypt3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
- jmp L010ecb_ret
+ jmp L012ecb_ret
.align 4,0x90
-L026ecb_dec_four:
+L028ecb_dec_four:
call __aesni_decrypt4
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
-L010ecb_ret:
+L012ecb_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
popl %edi
popl %esi
popl %ebx
@@ -611,7 +621,7 @@ L_aesni_ccm64_encrypt_blocks_begin:
leal 32(%edx,%ecx,1),%edx
subl %ecx,%ebx
.byte 102,15,56,0,253
-L028ccm64_enc_outer:
+L030ccm64_enc_outer:
movups (%ebp),%xmm0
movl %ebx,%ecx
movups (%esi),%xmm6
@@ -620,7 +630,7 @@ L028ccm64_enc_outer:
xorps %xmm6,%xmm0
xorps %xmm0,%xmm3
movups 32(%ebp),%xmm0
-L029ccm64_enc2_loop:
+L031ccm64_enc2_loop:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
movups (%edx,%ecx,1),%xmm1
@@ -628,7 +638,7 @@ L029ccm64_enc2_loop:
.byte 102,15,56,220,208
.byte 102,15,56,220,216
movups -16(%edx,%ecx,1),%xmm0
- jnz L029ccm64_enc2_loop
+ jnz L031ccm64_enc2_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
paddq 16(%esp),%xmm7
@@ -641,10 +651,18 @@ L029ccm64_enc2_loop:
movups %xmm6,(%edi)
.byte 102,15,56,0,213
leal 16(%edi),%edi
- jnz L028ccm64_enc_outer
+ jnz L030ccm64_enc_outer
movl 48(%esp),%esp
movl 40(%esp),%edi
movups %xmm3,(%edi)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
popl %edi
popl %esi
popl %ebx
@@ -691,12 +709,12 @@ L_aesni_ccm64_decrypt_blocks_begin:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L030enc1_loop_5:
+L032enc1_loop_5:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L030enc1_loop_5
+ jnz L032enc1_loop_5
.byte 102,15,56,221,209
shll $4,%ebx
movl $16,%ecx
@@ -706,16 +724,16 @@ L030enc1_loop_5:
subl %ebx,%ecx
leal 32(%ebp,%ebx,1),%edx
movl %ecx,%ebx
- jmp L031ccm64_dec_outer
+ jmp L033ccm64_dec_outer
.align 4,0x90
-L031ccm64_dec_outer:
+L033ccm64_dec_outer:
xorps %xmm2,%xmm6
movdqa %xmm7,%xmm2
movups %xmm6,(%edi)
leal 16(%edi),%edi
.byte 102,15,56,0,213
subl $1,%eax
- jz L032ccm64_dec_break
+ jz L034ccm64_dec_break
movups (%ebp),%xmm0
movl %ebx,%ecx
movups 16(%ebp),%xmm1
@@ -723,7 +741,7 @@ L031ccm64_dec_outer:
xorps %xmm0,%xmm2
xorps %xmm6,%xmm3
movups 32(%ebp),%xmm0
-L033ccm64_dec2_loop:
+L035ccm64_dec2_loop:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
movups (%edx,%ecx,1),%xmm1
@@ -731,7 +749,7 @@ L033ccm64_dec2_loop:
.byte 102,15,56,220,208
.byte 102,15,56,220,216
movups -16(%edx,%ecx,1),%xmm0
- jnz L033ccm64_dec2_loop
+ jnz L035ccm64_dec2_loop
movups (%esi),%xmm6
paddq 16(%esp),%xmm7
.byte 102,15,56,220,209
@@ -739,9 +757,9 @@ L033ccm64_dec2_loop:
.byte 102,15,56,221,208
.byte 102,15,56,221,216
leal 16(%esi),%esi
- jmp L031ccm64_dec_outer
+ jmp L033ccm64_dec_outer
.align 4,0x90
-L032ccm64_dec_break:
+L034ccm64_dec_break:
movl 240(%ebp),%ecx
movl %ebp,%edx
movups (%edx),%xmm0
@@ -749,16 +767,24 @@ L032ccm64_dec_break:
xorps %xmm0,%xmm6
leal 32(%edx),%edx
xorps %xmm6,%xmm3
-L034enc1_loop_6:
+L036enc1_loop_6:
.byte 102,15,56,220,217
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L034enc1_loop_6
+ jnz L036enc1_loop_6
.byte 102,15,56,221,217
movl 48(%esp),%esp
movl 40(%esp),%edi
movups %xmm3,(%edi)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
popl %edi
popl %esi
popl %ebx
@@ -783,7 +809,7 @@ L_aesni_ctr32_encrypt_blocks_begin:
andl $-16,%esp
movl %ebp,80(%esp)
cmpl $1,%eax
- je L035ctr32_one_shortcut
+ je L037ctr32_one_shortcut
movdqu (%ebx),%xmm7
movl $202182159,(%esp)
movl $134810123,4(%esp)
@@ -821,7 +847,7 @@ L_aesni_ctr32_encrypt_blocks_begin:
pshufd $192,%xmm0,%xmm2
pshufd $128,%xmm0,%xmm3
cmpl $6,%eax
- jb L036ctr32_tail
+ jb L038ctr32_tail
pxor %xmm6,%xmm7
shll $4,%ecx
movl $16,%ebx
@@ -830,9 +856,9 @@ L_aesni_ctr32_encrypt_blocks_begin:
subl %ecx,%ebx
leal 32(%edx,%ecx,1),%edx
subl $6,%eax
- jmp L037ctr32_loop6
+ jmp L039ctr32_loop6
.align 4,0x90
-L037ctr32_loop6:
+L039ctr32_loop6:
pshufd $64,%xmm0,%xmm4
movdqa 32(%esp),%xmm0
pshufd $192,%xmm1,%xmm5
@@ -886,27 +912,27 @@ L037ctr32_loop6:
leal 96(%edi),%edi
pshufd $128,%xmm0,%xmm3
subl $6,%eax
- jnc L037ctr32_loop6
+ jnc L039ctr32_loop6
addl $6,%eax
- jz L038ctr32_ret
+ jz L040ctr32_ret
movdqu (%ebp),%xmm7
movl %ebp,%edx
pxor 32(%esp),%xmm7
movl 240(%ebp),%ecx
-L036ctr32_tail:
+L038ctr32_tail:
por %xmm7,%xmm2
cmpl $2,%eax
- jb L039ctr32_one
+ jb L041ctr32_one
pshufd $64,%xmm0,%xmm4
por %xmm7,%xmm3
- je L040ctr32_two
+ je L042ctr32_two
pshufd $192,%xmm1,%xmm5
por %xmm7,%xmm4
cmpl $4,%eax
- jb L041ctr32_three
+ jb L043ctr32_three
pshufd $128,%xmm1,%xmm6
por %xmm7,%xmm5
- je L042ctr32_four
+ je L044ctr32_four
por %xmm7,%xmm6
call __aesni_encrypt6
movups (%esi),%xmm1
@@ -924,29 +950,29 @@ L036ctr32_tail:
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
- jmp L038ctr32_ret
+ jmp L040ctr32_ret
.align 4,0x90
-L035ctr32_one_shortcut:
+L037ctr32_one_shortcut:
movups (%ebx),%xmm2
movl 240(%edx),%ecx
-L039ctr32_one:
+L041ctr32_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L043enc1_loop_7:
+L045enc1_loop_7:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L043enc1_loop_7
+ jnz L045enc1_loop_7
.byte 102,15,56,221,209
movups (%esi),%xmm6
xorps %xmm2,%xmm6
movups %xmm6,(%edi)
- jmp L038ctr32_ret
+ jmp L040ctr32_ret
.align 4,0x90
-L040ctr32_two:
+L042ctr32_two:
call __aesni_encrypt2
movups (%esi),%xmm5
movups 16(%esi),%xmm6
@@ -954,9 +980,9 @@ L040ctr32_two:
xorps %xmm6,%xmm3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
- jmp L038ctr32_ret
+ jmp L040ctr32_ret
.align 4,0x90
-L041ctr32_three:
+L043ctr32_three:
call __aesni_encrypt3
movups (%esi),%xmm5
movups 16(%esi),%xmm6
@@ -967,9 +993,9 @@ L041ctr32_three:
xorps %xmm7,%xmm4
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
- jmp L038ctr32_ret
+ jmp L040ctr32_ret
.align 4,0x90
-L042ctr32_four:
+L044ctr32_four:
call __aesni_encrypt4
movups (%esi),%xmm6
movups 16(%esi),%xmm7
@@ -983,7 +1009,18 @@ L042ctr32_four:
xorps %xmm0,%xmm5
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
-L038ctr32_ret:
+L040ctr32_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ movdqa %xmm0,32(%esp)
+ pxor %xmm5,%xmm5
+ movdqa %xmm0,48(%esp)
+ pxor %xmm6,%xmm6
+ movdqa %xmm0,64(%esp)
+ pxor %xmm7,%xmm7
movl 80(%esp),%esp
popl %edi
popl %esi
@@ -1007,12 +1044,12 @@ L_aesni_xts_encrypt_begin:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L044enc1_loop_8:
+L046enc1_loop_8:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L044enc1_loop_8
+ jnz L046enc1_loop_8
.byte 102,15,56,221,209
movl 20(%esp),%esi
movl 24(%esp),%edi
@@ -1036,14 +1073,14 @@ L044enc1_loop_8:
movl %edx,%ebp
movl %ecx,%ebx
subl $96,%eax
- jc L045xts_enc_short
+ jc L047xts_enc_short
shll $4,%ecx
movl $16,%ebx
subl %ecx,%ebx
leal 32(%edx,%ecx,1),%edx
- jmp L046xts_enc_loop6
+ jmp L048xts_enc_loop6
.align 4,0x90
-L046xts_enc_loop6:
+L048xts_enc_loop6:
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,(%esp)
@@ -1132,23 +1169,23 @@ L046xts_enc_loop6:
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
subl $96,%eax
- jnc L046xts_enc_loop6
+ jnc L048xts_enc_loop6
movl 240(%ebp),%ecx
movl %ebp,%edx
movl %ecx,%ebx
-L045xts_enc_short:
+L047xts_enc_short:
addl $96,%eax
- jz L047xts_enc_done6x
+ jz L049xts_enc_done6x
movdqa %xmm1,%xmm5
cmpl $32,%eax
- jb L048xts_enc_one
+ jb L050xts_enc_one
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
paddq %xmm1,%xmm1
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
- je L049xts_enc_two
+ je L051xts_enc_two
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm6
@@ -1157,7 +1194,7 @@ L045xts_enc_short:
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
cmpl $64,%eax
- jb L050xts_enc_three
+ jb L052xts_enc_three
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm7
@@ -1167,7 +1204,7 @@ L045xts_enc_short:
pxor %xmm2,%xmm1
movdqa %xmm5,(%esp)
movdqa %xmm6,16(%esp)
- je L051xts_enc_four
+ je L053xts_enc_four
movdqa %xmm7,32(%esp)
pshufd $19,%xmm0,%xmm7
movdqa %xmm1,48(%esp)
@@ -1199,9 +1236,9 @@ L045xts_enc_short:
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
leal 80(%edi),%edi
- jmp L052xts_enc_done
+ jmp L054xts_enc_done
.align 4,0x90
-L048xts_enc_one:
+L050xts_enc_one:
movups (%esi),%xmm2
leal 16(%esi),%esi
xorps %xmm5,%xmm2
@@ -1209,20 +1246,20 @@ L048xts_enc_one:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L053enc1_loop_9:
+L055enc1_loop_9:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L053enc1_loop_9
+ jnz L055enc1_loop_9
.byte 102,15,56,221,209
xorps %xmm5,%xmm2
movups %xmm2,(%edi)
leal 16(%edi),%edi
movdqa %xmm5,%xmm1
- jmp L052xts_enc_done
+ jmp L054xts_enc_done
.align 4,0x90
-L049xts_enc_two:
+L051xts_enc_two:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1236,9 +1273,9 @@ L049xts_enc_two:
movups %xmm3,16(%edi)
leal 32(%edi),%edi
movdqa %xmm6,%xmm1
- jmp L052xts_enc_done
+ jmp L054xts_enc_done
.align 4,0x90
-L050xts_enc_three:
+L052xts_enc_three:
movaps %xmm1,%xmm7
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1256,9 +1293,9 @@ L050xts_enc_three:
movups %xmm4,32(%edi)
leal 48(%edi),%edi
movdqa %xmm7,%xmm1
- jmp L052xts_enc_done
+ jmp L054xts_enc_done
.align 4,0x90
-L051xts_enc_four:
+L053xts_enc_four:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1280,28 +1317,28 @@ L051xts_enc_four:
movups %xmm5,48(%edi)
leal 64(%edi),%edi
movdqa %xmm6,%xmm1
- jmp L052xts_enc_done
+ jmp L054xts_enc_done
.align 4,0x90
-L047xts_enc_done6x:
+L049xts_enc_done6x:
movl 112(%esp),%eax
andl $15,%eax
- jz L054xts_enc_ret
+ jz L056xts_enc_ret
movdqa %xmm1,%xmm5
movl %eax,112(%esp)
- jmp L055xts_enc_steal
+ jmp L057xts_enc_steal
.align 4,0x90
-L052xts_enc_done:
+L054xts_enc_done:
movl 112(%esp),%eax
pxor %xmm0,%xmm0
andl $15,%eax
- jz L054xts_enc_ret
+ jz L056xts_enc_ret
pcmpgtd %xmm1,%xmm0
movl %eax,112(%esp)
pshufd $19,%xmm0,%xmm5
paddq %xmm1,%xmm1
pand 96(%esp),%xmm5
pxor %xmm1,%xmm5
-L055xts_enc_steal:
+L057xts_enc_steal:
movzbl (%esi),%ecx
movzbl -16(%edi),%edx
leal 1(%esi),%esi
@@ -1309,7 +1346,7 @@ L055xts_enc_steal:
movb %dl,(%edi)
leal 1(%edi),%edi
subl $1,%eax
- jnz L055xts_enc_steal
+ jnz L057xts_enc_steal
subl 112(%esp),%edi
movl %ebp,%edx
movl %ebx,%ecx
@@ -1319,16 +1356,30 @@ L055xts_enc_steal:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L056enc1_loop_10:
+L058enc1_loop_10:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L056enc1_loop_10
+ jnz L058enc1_loop_10
.byte 102,15,56,221,209
xorps %xmm5,%xmm2
movups %xmm2,-16(%edi)
-L054xts_enc_ret:
+L056xts_enc_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ movdqa %xmm0,(%esp)
+ pxor %xmm3,%xmm3
+ movdqa %xmm0,16(%esp)
+ pxor %xmm4,%xmm4
+ movdqa %xmm0,32(%esp)
+ pxor %xmm5,%xmm5
+ movdqa %xmm0,48(%esp)
+ pxor %xmm6,%xmm6
+ movdqa %xmm0,64(%esp)
+ pxor %xmm7,%xmm7
+ movdqa %xmm0,80(%esp)
movl 116(%esp),%esp
popl %edi
popl %esi
@@ -1352,12 +1403,12 @@ L_aesni_xts_decrypt_begin:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L057enc1_loop_11:
+L059enc1_loop_11:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L057enc1_loop_11
+ jnz L059enc1_loop_11
.byte 102,15,56,221,209
movl 20(%esp),%esi
movl 24(%esp),%edi
@@ -1386,14 +1437,14 @@ L057enc1_loop_11:
pcmpgtd %xmm1,%xmm0
andl $-16,%eax
subl $96,%eax
- jc L058xts_dec_short
+ jc L060xts_dec_short
shll $4,%ecx
movl $16,%ebx
subl %ecx,%ebx
leal 32(%edx,%ecx,1),%edx
- jmp L059xts_dec_loop6
+ jmp L061xts_dec_loop6
.align 4,0x90
-L059xts_dec_loop6:
+L061xts_dec_loop6:
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,(%esp)
@@ -1482,23 +1533,23 @@ L059xts_dec_loop6:
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
subl $96,%eax
- jnc L059xts_dec_loop6
+ jnc L061xts_dec_loop6
movl 240(%ebp),%ecx
movl %ebp,%edx
movl %ecx,%ebx
-L058xts_dec_short:
+L060xts_dec_short:
addl $96,%eax
- jz L060xts_dec_done6x
+ jz L062xts_dec_done6x
movdqa %xmm1,%xmm5
cmpl $32,%eax
- jb L061xts_dec_one
+ jb L063xts_dec_one
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
paddq %xmm1,%xmm1
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
- je L062xts_dec_two
+ je L064xts_dec_two
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm6
@@ -1507,7 +1558,7 @@ L058xts_dec_short:
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
cmpl $64,%eax
- jb L063xts_dec_three
+ jb L065xts_dec_three
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm7
@@ -1517,7 +1568,7 @@ L058xts_dec_short:
pxor %xmm2,%xmm1
movdqa %xmm5,(%esp)
movdqa %xmm6,16(%esp)
- je L064xts_dec_four
+ je L066xts_dec_four
movdqa %xmm7,32(%esp)
pshufd $19,%xmm0,%xmm7
movdqa %xmm1,48(%esp)
@@ -1549,9 +1600,9 @@ L058xts_dec_short:
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
leal 80(%edi),%edi
- jmp L065xts_dec_done
+ jmp L067xts_dec_done
.align 4,0x90
-L061xts_dec_one:
+L063xts_dec_one:
movups (%esi),%xmm2
leal 16(%esi),%esi
xorps %xmm5,%xmm2
@@ -1559,20 +1610,20 @@ L061xts_dec_one:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L066dec1_loop_12:
+L068dec1_loop_12:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L066dec1_loop_12
+ jnz L068dec1_loop_12
.byte 102,15,56,223,209
xorps %xmm5,%xmm2
movups %xmm2,(%edi)
leal 16(%edi),%edi
movdqa %xmm5,%xmm1
- jmp L065xts_dec_done
+ jmp L067xts_dec_done
.align 4,0x90
-L062xts_dec_two:
+L064xts_dec_two:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1586,9 +1637,9 @@ L062xts_dec_two:
movups %xmm3,16(%edi)
leal 32(%edi),%edi
movdqa %xmm6,%xmm1
- jmp L065xts_dec_done
+ jmp L067xts_dec_done
.align 4,0x90
-L063xts_dec_three:
+L065xts_dec_three:
movaps %xmm1,%xmm7
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1606,9 +1657,9 @@ L063xts_dec_three:
movups %xmm4,32(%edi)
leal 48(%edi),%edi
movdqa %xmm7,%xmm1
- jmp L065xts_dec_done
+ jmp L067xts_dec_done
.align 4,0x90
-L064xts_dec_four:
+L066xts_dec_four:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1630,20 +1681,20 @@ L064xts_dec_four:
movups %xmm5,48(%edi)
leal 64(%edi),%edi
movdqa %xmm6,%xmm1
- jmp L065xts_dec_done
+ jmp L067xts_dec_done
.align 4,0x90
-L060xts_dec_done6x:
+L062xts_dec_done6x:
movl 112(%esp),%eax
andl $15,%eax
- jz L067xts_dec_ret
+ jz L069xts_dec_ret
movl %eax,112(%esp)
- jmp L068xts_dec_only_one_more
+ jmp L070xts_dec_only_one_more
.align 4,0x90
-L065xts_dec_done:
+L067xts_dec_done:
movl 112(%esp),%eax
pxor %xmm0,%xmm0
andl $15,%eax
- jz L067xts_dec_ret
+ jz L069xts_dec_ret
pcmpgtd %xmm1,%xmm0
movl %eax,112(%esp)
pshufd $19,%xmm0,%xmm2
@@ -1653,7 +1704,7 @@ L065xts_dec_done:
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
-L068xts_dec_only_one_more:
+L070xts_dec_only_one_more:
pshufd $19,%xmm0,%xmm5
movdqa %xmm1,%xmm6
paddq %xmm1,%xmm1
@@ -1667,16 +1718,16 @@ L068xts_dec_only_one_more:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L069dec1_loop_13:
+L071dec1_loop_13:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L069dec1_loop_13
+ jnz L071dec1_loop_13
.byte 102,15,56,223,209
xorps %xmm5,%xmm2
movups %xmm2,(%edi)
-L070xts_dec_steal:
+L072xts_dec_steal:
movzbl 16(%esi),%ecx
movzbl (%edi),%edx
leal 1(%esi),%esi
@@ -1684,7 +1735,7 @@ L070xts_dec_steal:
movb %dl,16(%edi)
leal 1(%edi),%edi
subl $1,%eax
- jnz L070xts_dec_steal
+ jnz L072xts_dec_steal
subl 112(%esp),%edi
movl %ebp,%edx
movl %ebx,%ecx
@@ -1694,16 +1745,30 @@ L070xts_dec_steal:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L071dec1_loop_14:
+L073dec1_loop_14:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L071dec1_loop_14
+ jnz L073dec1_loop_14
.byte 102,15,56,223,209
xorps %xmm6,%xmm2
movups %xmm2,(%edi)
-L067xts_dec_ret:
+L069xts_dec_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ movdqa %xmm0,(%esp)
+ pxor %xmm3,%xmm3
+ movdqa %xmm0,16(%esp)
+ pxor %xmm4,%xmm4
+ movdqa %xmm0,32(%esp)
+ pxor %xmm5,%xmm5
+ movdqa %xmm0,48(%esp)
+ pxor %xmm6,%xmm6
+ movdqa %xmm0,64(%esp)
+ pxor %xmm7,%xmm7
+ movdqa %xmm0,80(%esp)
movl 116(%esp),%esp
popl %edi
popl %esi
@@ -1728,7 +1793,7 @@ L_aesni_cbc_encrypt_begin:
movl 32(%esp),%edx
movl 36(%esp),%ebp
testl %eax,%eax
- jz L072cbc_abort
+ jz L074cbc_abort
cmpl $0,40(%esp)
xchgl %esp,%ebx
movups (%ebp),%xmm7
@@ -1736,14 +1801,14 @@ L_aesni_cbc_encrypt_begin:
movl %edx,%ebp
movl %ebx,16(%esp)
movl %ecx,%ebx
- je L073cbc_decrypt
+ je L075cbc_decrypt
movaps %xmm7,%xmm2
cmpl $16,%eax
- jb L074cbc_enc_tail
+ jb L076cbc_enc_tail
subl $16,%eax
- jmp L075cbc_enc_loop
+ jmp L077cbc_enc_loop
.align 4,0x90
-L075cbc_enc_loop:
+L077cbc_enc_loop:
movups (%esi),%xmm7
leal 16(%esi),%esi
movups (%edx),%xmm0
@@ -1751,24 +1816,25 @@ L075cbc_enc_loop:
xorps %xmm0,%xmm7
leal 32(%edx),%edx
xorps %xmm7,%xmm2
-L076enc1_loop_15:
+L078enc1_loop_15:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L076enc1_loop_15
+ jnz L078enc1_loop_15
.byte 102,15,56,221,209
movl %ebx,%ecx
movl %ebp,%edx
movups %xmm2,(%edi)
leal 16(%edi),%edi
subl $16,%eax
- jnc L075cbc_enc_loop
+ jnc L077cbc_enc_loop
addl $16,%eax
- jnz L074cbc_enc_tail
+ jnz L076cbc_enc_tail
movaps %xmm2,%xmm7
- jmp L077cbc_ret
-L074cbc_enc_tail:
+ pxor %xmm2,%xmm2
+ jmp L079cbc_ret
+L076cbc_enc_tail:
movl %eax,%ecx
.long 2767451785
movl $16,%ecx
@@ -1779,20 +1845,20 @@ L074cbc_enc_tail:
movl %ebx,%ecx
movl %edi,%esi
movl %ebp,%edx
- jmp L075cbc_enc_loop
+ jmp L077cbc_enc_loop
.align 4,0x90
-L073cbc_decrypt:
+L075cbc_decrypt:
cmpl $80,%eax
- jbe L078cbc_dec_tail
+ jbe L080cbc_dec_tail
movaps %xmm7,(%esp)
subl $80,%eax
- jmp L079cbc_dec_loop6_enter
+ jmp L081cbc_dec_loop6_enter
.align 4,0x90
-L080cbc_dec_loop6:
+L082cbc_dec_loop6:
movaps %xmm0,(%esp)
movups %xmm7,(%edi)
leal 16(%edi),%edi
-L079cbc_dec_loop6_enter:
+L081cbc_dec_loop6_enter:
movdqu (%esi),%xmm2
movdqu 16(%esi),%xmm3
movdqu 32(%esi),%xmm4
@@ -1822,28 +1888,28 @@ L079cbc_dec_loop6_enter:
movups %xmm6,64(%edi)
leal 80(%edi),%edi
subl $96,%eax
- ja L080cbc_dec_loop6
+ ja L082cbc_dec_loop6
movaps %xmm7,%xmm2
movaps %xmm0,%xmm7
addl $80,%eax
- jle L081cbc_dec_tail_collected
+ jle L083cbc_dec_clear_tail_collected
movups %xmm2,(%edi)
leal 16(%edi),%edi
-L078cbc_dec_tail:
+L080cbc_dec_tail:
movups (%esi),%xmm2
movaps %xmm2,%xmm6
cmpl $16,%eax
- jbe L082cbc_dec_one
+ jbe L084cbc_dec_one
movups 16(%esi),%xmm3
movaps %xmm3,%xmm5
cmpl $32,%eax
- jbe L083cbc_dec_two
+ jbe L085cbc_dec_two
movups 32(%esi),%xmm4
cmpl $48,%eax
- jbe L084cbc_dec_three
+ jbe L086cbc_dec_three
movups 48(%esi),%xmm5
cmpl $64,%eax
- jbe L085cbc_dec_four
+ jbe L087cbc_dec_four
movups 64(%esi),%xmm6
movaps %xmm7,(%esp)
movups (%esi),%xmm2
@@ -1861,55 +1927,62 @@ L078cbc_dec_tail:
xorps %xmm0,%xmm6
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%edi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%edi)
+ pxor %xmm5,%xmm5
leal 64(%edi),%edi
movaps %xmm6,%xmm2
+ pxor %xmm6,%xmm6
subl $80,%eax
- jmp L081cbc_dec_tail_collected
+ jmp L088cbc_dec_tail_collected
.align 4,0x90
-L082cbc_dec_one:
+L084cbc_dec_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L086dec1_loop_16:
+L089dec1_loop_16:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L086dec1_loop_16
+ jnz L089dec1_loop_16
.byte 102,15,56,223,209
xorps %xmm7,%xmm2
movaps %xmm6,%xmm7
subl $16,%eax
- jmp L081cbc_dec_tail_collected
+ jmp L088cbc_dec_tail_collected
.align 4,0x90
-L083cbc_dec_two:
+L085cbc_dec_two:
call __aesni_decrypt2
xorps %xmm7,%xmm2
xorps %xmm6,%xmm3
movups %xmm2,(%edi)
movaps %xmm3,%xmm2
+ pxor %xmm3,%xmm3
leal 16(%edi),%edi
movaps %xmm5,%xmm7
subl $32,%eax
- jmp L081cbc_dec_tail_collected
+ jmp L088cbc_dec_tail_collected
.align 4,0x90
-L084cbc_dec_three:
+L086cbc_dec_three:
call __aesni_decrypt3
xorps %xmm7,%xmm2
xorps %xmm6,%xmm3
xorps %xmm5,%xmm4
movups %xmm2,(%edi)
movaps %xmm4,%xmm2
+ pxor %xmm4,%xmm4
movups %xmm3,16(%edi)
+ pxor %xmm3,%xmm3
leal 32(%edi),%edi
movups 32(%esi),%xmm7
subl $48,%eax
- jmp L081cbc_dec_tail_collected
+ jmp L088cbc_dec_tail_collected
.align 4,0x90
-L085cbc_dec_four:
+L087cbc_dec_four:
call __aesni_decrypt4
movups 16(%esi),%xmm1
movups 32(%esi),%xmm0
@@ -1919,28 +1992,44 @@ L085cbc_dec_four:
movups %xmm2,(%edi)
xorps %xmm1,%xmm4
movups %xmm3,16(%edi)
+ pxor %xmm3,%xmm3
xorps %xmm0,%xmm5
movups %xmm4,32(%edi)
+ pxor %xmm4,%xmm4
leal 48(%edi),%edi
movaps %xmm5,%xmm2
+ pxor %xmm5,%xmm5
subl $64,%eax
-L081cbc_dec_tail_collected:
+ jmp L088cbc_dec_tail_collected
+.align 4,0x90
+L083cbc_dec_clear_tail_collected:
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+L088cbc_dec_tail_collected:
andl $15,%eax
- jnz L087cbc_dec_tail_partial
+ jnz L090cbc_dec_tail_partial
movups %xmm2,(%edi)
- jmp L077cbc_ret
+ pxor %xmm0,%xmm0
+ jmp L079cbc_ret
.align 4,0x90
-L087cbc_dec_tail_partial:
+L090cbc_dec_tail_partial:
movaps %xmm2,(%esp)
+ pxor %xmm0,%xmm0
movl $16,%ecx
movl %esp,%esi
subl %eax,%ecx
.long 2767451785
-L077cbc_ret:
+ movdqa %xmm2,(%esp)
+L079cbc_ret:
movl 16(%esp),%esp
movl 36(%esp),%ebp
+ pxor %xmm2,%xmm2
+ pxor %xmm1,%xmm1
movups %xmm7,(%ebp)
-L072cbc_abort:
+ pxor %xmm7,%xmm7
+L074cbc_abort:
popl %edi
popl %esi
popl %ebx
@@ -1949,52 +2038,62 @@ L072cbc_abort:
.private_extern __aesni_set_encrypt_key
.align 4
__aesni_set_encrypt_key:
+ pushl %ebp
+ pushl %ebx
testl %eax,%eax
- jz L088bad_pointer
+ jz L091bad_pointer
testl %edx,%edx
- jz L088bad_pointer
+ jz L091bad_pointer
+ call L092pic
+L092pic:
+ popl %ebx
+ leal Lkey_const-L092pic(%ebx),%ebx
+ movl L_OPENSSL_ia32cap_P$non_lazy_ptr-Lkey_const(%ebx),%ebp
movups (%eax),%xmm0
xorps %xmm4,%xmm4
+ movl 4(%ebp),%ebp
leal 16(%edx),%edx
+ andl $268437504,%ebp
cmpl $256,%ecx
- je L08914rounds
+ je L09314rounds
cmpl $192,%ecx
- je L09012rounds
+ je L09412rounds
cmpl $128,%ecx
- jne L091bad_keybits
+ jne L095bad_keybits
.align 4,0x90
-L09210rounds:
+L09610rounds:
+ cmpl $268435456,%ebp
+ je L09710rounds_alt
movl $9,%ecx
movups %xmm0,-16(%edx)
.byte 102,15,58,223,200,1
- call L093key_128_cold
+ call L098key_128_cold
.byte 102,15,58,223,200,2
- call L094key_128
+ call L099key_128
.byte 102,15,58,223,200,4
- call L094key_128
+ call L099key_128
.byte 102,15,58,223,200,8
- call L094key_128
+ call L099key_128
.byte 102,15,58,223,200,16
- call L094key_128
+ call L099key_128
.byte 102,15,58,223,200,32
- call L094key_128
+ call L099key_128
.byte 102,15,58,223,200,64
- call L094key_128
+ call L099key_128
.byte 102,15,58,223,200,128
- call L094key_128
+ call L099key_128
.byte 102,15,58,223,200,27
- call L094key_128
+ call L099key_128
.byte 102,15,58,223,200,54
- call L094key_128
+ call L099key_128
movups %xmm0,(%edx)
movl %ecx,80(%edx)
- xorl %eax,%eax
- ret
+ jmp L100good_key
.align 4,0x90
-L094key_128:
+L099key_128:
movups %xmm0,(%edx)
leal 16(%edx),%edx
-L093key_128_cold:
+L098key_128_cold:
shufps $16,%xmm0,%xmm4
xorps %xmm4,%xmm0
shufps $140,%xmm0,%xmm4
@@ -2003,38 +2102,91 @@ L093key_128_cold:
xorps %xmm1,%xmm0
ret
.align 4,0x90
-L09012rounds:
+L09710rounds_alt:
+ movdqa (%ebx),%xmm5
+ movl $8,%ecx
+ movdqa 32(%ebx),%xmm4
+ movdqa %xmm0,%xmm2
+ movdqu %xmm0,-16(%edx)
+L101loop_key128:
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+ leal 16(%edx),%edx
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,-16(%edx)
+ movdqa %xmm0,%xmm2
+ decl %ecx
+ jnz L101loop_key128
+ movdqa 48(%ebx),%xmm4
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%edx)
+ movdqa %xmm0,%xmm2
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,16(%edx)
+ movl $9,%ecx
+ movl %ecx,96(%edx)
+ jmp L100good_key
+.align 4,0x90
+L09412rounds:
movq 16(%eax),%xmm2
+ cmpl $268435456,%ebp
+ je L10212rounds_alt
movl $11,%ecx
movups %xmm0,-16(%edx)
.byte 102,15,58,223,202,1
- call L095key_192a_cold
+ call L103key_192a_cold
.byte 102,15,58,223,202,2
- call L096key_192b
+ call L104key_192b
.byte 102,15,58,223,202,4
- call L097key_192a
+ call L105key_192a
.byte 102,15,58,223,202,8
- call L096key_192b
+ call L104key_192b
.byte 102,15,58,223,202,16
- call L097key_192a
+ call L105key_192a
.byte 102,15,58,223,202,32
- call L096key_192b
+ call L104key_192b
.byte 102,15,58,223,202,64
- call L097key_192a
+ call L105key_192a
.byte 102,15,58,223,202,128
- call L096key_192b
+ call L104key_192b
movups %xmm0,(%edx)
movl %ecx,48(%edx)
- xorl %eax,%eax
- ret
+ jmp L100good_key
.align 4,0x90
-L097key_192a:
+L105key_192a:
movups %xmm0,(%edx)
leal 16(%edx),%edx
.align 4,0x90
-L095key_192a_cold:
+L103key_192a_cold:
movaps %xmm2,%xmm5
-L098key_192b_warm:
+L106key_192b_warm:
shufps $16,%xmm0,%xmm4
movdqa %xmm2,%xmm3
xorps %xmm4,%xmm0
@@ -2048,56 +2200,90 @@ L098key_192b_warm:
pxor %xmm3,%xmm2
ret
.align 4,0x90
-L096key_192b:
+L104key_192b:
movaps %xmm0,%xmm3
shufps $68,%xmm0,%xmm5
movups %xmm5,(%edx)
shufps $78,%xmm2,%xmm3
movups %xmm3,16(%edx)
leal 32(%edx),%edx
- jmp L098key_192b_warm
+ jmp L106key_192b_warm
.align 4,0x90
-L08914rounds:
+L10212rounds_alt:
+ movdqa 16(%ebx),%xmm5
+ movdqa 32(%ebx),%xmm4
+ movl $8,%ecx
+ movdqu %xmm0,-16(%edx)
+L107loop_key192:
+ movq %xmm2,(%edx)
+ movdqa %xmm2,%xmm1
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+ pslld $1,%xmm4
+ leal 24(%edx),%edx
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+ pshufd $255,%xmm0,%xmm3
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pxor %xmm2,%xmm0
+ pxor %xmm3,%xmm2
+ movdqu %xmm0,-16(%edx)
+ decl %ecx
+ jnz L107loop_key192
+ movl $11,%ecx
+ movl %ecx,32(%edx)
+ jmp L100good_key
+.align 4,0x90
+L09314rounds:
movups 16(%eax),%xmm2
- movl $13,%ecx
leal 16(%edx),%edx
+ cmpl $268435456,%ebp
+ je L10814rounds_alt
+ movl $13,%ecx
movups %xmm0,-32(%edx)
movups %xmm2,-16(%edx)
.byte 102,15,58,223,202,1
- call L099key_256a_cold
+ call L109key_256a_cold
.byte 102,15,58,223,200,1
- call L100key_256b
+ call L110key_256b
.byte 102,15,58,223,202,2
- call L101key_256a
+ call L111key_256a
.byte 102,15,58,223,200,2
- call L100key_256b
+ call L110key_256b
.byte 102,15,58,223,202,4
- call L101key_256a
+ call L111key_256a
.byte 102,15,58,223,200,4
- call L100key_256b
+ call L110key_256b
.byte 102,15,58,223,202,8
- call L101key_256a
+ call L111key_256a
.byte 102,15,58,223,200,8
- call L100key_256b
+ call L110key_256b
.byte 102,15,58,223,202,16
- call L101key_256a
+ call L111key_256a
.byte 102,15,58,223,200,16
- call L100key_256b
+ call L110key_256b
.byte 102,15,58,223,202,32
- call L101key_256a
+ call L111key_256a
.byte 102,15,58,223,200,32
- call L100key_256b
+ call L110key_256b
.byte 102,15,58,223,202,64
- call L101key_256a
+ call L111key_256a
movups %xmm0,(%edx)
movl %ecx,16(%edx)
xorl %eax,%eax
- ret
+ jmp L100good_key
.align 4,0x90
-L101key_256a:
+L111key_256a:
movups %xmm2,(%edx)
leal 16(%edx),%edx
-L099key_256a_cold:
+L109key_256a_cold:
shufps $16,%xmm0,%xmm4
xorps %xmm4,%xmm0
shufps $140,%xmm0,%xmm4
@@ -2106,7 +2292,7 @@ L099key_256a_cold:
xorps %xmm1,%xmm0
ret
.align 4,0x90
-L100key_256b:
+L110key_256b:
movups %xmm0,(%edx)
leal 16(%edx),%edx
shufps $16,%xmm2,%xmm4
@@ -2116,13 +2302,70 @@ L100key_256b:
shufps $170,%xmm1,%xmm1
xorps %xmm1,%xmm2
ret
+.align 4,0x90
+L10814rounds_alt:
+ movdqa (%ebx),%xmm5
+ movdqa 32(%ebx),%xmm4
+ movl $7,%ecx
+ movdqu %xmm0,-32(%edx)
+ movdqa %xmm2,%xmm1
+ movdqu %xmm2,-16(%edx)
+L112loop_key256:
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+ pslld $1,%xmm4
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%edx)
+ decl %ecx
+ jz L113done_key256
+ pshufd $255,%xmm0,%xmm2
+ pxor %xmm3,%xmm3
+.byte 102,15,56,221,211
+ movdqa %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm3,%xmm1
+ pxor %xmm1,%xmm2
+ movdqu %xmm2,16(%edx)
+ leal 32(%edx),%edx
+ movdqa %xmm2,%xmm1
+ jmp L112loop_key256
+L113done_key256:
+ movl $13,%ecx
+ movl %ecx,16(%edx)
+L100good_key:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ xorl %eax,%eax
+ popl %ebx
+ popl %ebp
+ ret
.align 2,0x90
-L088bad_pointer:
+L091bad_pointer:
movl $-1,%eax
+ popl %ebx
+ popl %ebp
ret
.align 2,0x90
-L091bad_keybits:
+L095bad_keybits:
+ pxor %xmm0,%xmm0
movl $-2,%eax
+ popl %ebx
+ popl %ebp
ret
.globl _aesni_set_encrypt_key
.private_extern _aesni_set_encrypt_key
@@ -2146,7 +2389,7 @@ L_aesni_set_decrypt_key_begin:
movl 12(%esp),%edx
shll $4,%ecx
testl %eax,%eax
- jnz L102dec_key_ret
+ jnz L114dec_key_ret
leal 16(%edx,%ecx,1),%eax
movups (%edx),%xmm0
movups (%eax),%xmm1
@@ -2154,7 +2397,7 @@ L_aesni_set_decrypt_key_begin:
movups %xmm1,(%edx)
leal 16(%edx),%edx
leal -16(%eax),%eax
-L103dec_key_inverse:
+L115dec_key_inverse:
movups (%edx),%xmm0
movups (%eax),%xmm1
.byte 102,15,56,219,192
@@ -2164,15 +2407,27 @@ L103dec_key_inverse:
movups %xmm0,16(%eax)
movups %xmm1,-16(%edx)
cmpl %edx,%eax
- ja L103dec_key_inverse
+ ja L115dec_key_inverse
movups (%edx),%xmm0
.byte 102,15,56,219,192
movups %xmm0,(%edx)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
xorl %eax,%eax
-L102dec_key_ret:
+L114dec_key_ret:
ret
+.align 6,0x90
+Lkey_const:
+.long 202313229,202313229,202313229,202313229
+.long 67569157,67569157,67569157,67569157
+.long 1,1,1,1
+.long 27,27,27,27
.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
.byte 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
.byte 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
.byte 115,108,46,111,114,103,62,0
+.section __IMPORT,__pointers,non_lazy_symbol_pointers
+L_OPENSSL_ia32cap_P$non_lazy_ptr:
+.indirect_symbol _OPENSSL_ia32cap_P
+.long 0
#endif
diff --git a/mac-x86/crypto/bn/bn-586.S b/mac-x86/crypto/bn/bn-586.S
index 34cf56f..0f0a94e 100644
--- a/mac-x86/crypto/bn/bn-586.S
+++ b/mac-x86/crypto/bn/bn-586.S
@@ -6,6 +6,102 @@
.align 4
_bn_mul_add_words:
L_bn_mul_add_words_begin:
+ call L000PIC_me_up
+L000PIC_me_up:
+ popl %eax
+ movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L000PIC_me_up(%eax),%eax
+ btl $26,(%eax)
+ jnc L001maw_non_sse2
+ movl 4(%esp),%eax
+ movl 8(%esp),%edx
+ movl 12(%esp),%ecx
+ movd 16(%esp),%mm0
+ pxor %mm1,%mm1
+ jmp L002maw_sse2_entry
+.align 4,0x90
+L003maw_sse2_unrolled:
+ movd (%eax),%mm3
+ paddq %mm3,%mm1
+ movd (%edx),%mm2
+ pmuludq %mm0,%mm2
+ movd 4(%edx),%mm4
+ pmuludq %mm0,%mm4
+ movd 8(%edx),%mm6
+ pmuludq %mm0,%mm6
+ movd 12(%edx),%mm7
+ pmuludq %mm0,%mm7
+ paddq %mm2,%mm1
+ movd 4(%eax),%mm3
+ paddq %mm4,%mm3
+ movd 8(%eax),%mm5
+ paddq %mm6,%mm5
+ movd 12(%eax),%mm4
+ paddq %mm4,%mm7
+ movd %mm1,(%eax)
+ movd 16(%edx),%mm2
+ pmuludq %mm0,%mm2
+ psrlq $32,%mm1
+ movd 20(%edx),%mm4
+ pmuludq %mm0,%mm4
+ paddq %mm3,%mm1
+ movd 24(%edx),%mm6
+ pmuludq %mm0,%mm6
+ movd %mm1,4(%eax)
+ psrlq $32,%mm1
+ movd 28(%edx),%mm3
+ addl $32,%edx
+ pmuludq %mm0,%mm3
+ paddq %mm5,%mm1
+ movd 16(%eax),%mm5
+ paddq %mm5,%mm2
+ movd %mm1,8(%eax)
+ psrlq $32,%mm1
+ paddq %mm7,%mm1
+ movd 20(%eax),%mm5
+ paddq %mm5,%mm4
+ movd %mm1,12(%eax)
+ psrlq $32,%mm1
+ paddq %mm2,%mm1
+ movd 24(%eax),%mm5
+ paddq %mm5,%mm6
+ movd %mm1,16(%eax)
+ psrlq $32,%mm1
+ paddq %mm4,%mm1
+ movd 28(%eax),%mm5
+ paddq %mm5,%mm3
+ movd %mm1,20(%eax)
+ psrlq $32,%mm1
+ paddq %mm6,%mm1
+ movd %mm1,24(%eax)
+ psrlq $32,%mm1
+ paddq %mm3,%mm1
+ movd %mm1,28(%eax)
+ leal 32(%eax),%eax
+ psrlq $32,%mm1
+ subl $8,%ecx
+ jz L004maw_sse2_exit
+L002maw_sse2_entry:
+ testl $4294967288,%ecx
+ jnz L003maw_sse2_unrolled
+.align 2,0x90
+L005maw_sse2_loop:
+ movd (%edx),%mm2
+ movd (%eax),%mm3
+ pmuludq %mm0,%mm2
+ leal 4(%edx),%edx
+ paddq %mm3,%mm1
+ paddq %mm2,%mm1
+ movd %mm1,(%eax)
+ subl $1,%ecx
+ psrlq $32,%mm1
+ leal 4(%eax),%eax
+ jnz L005maw_sse2_loop
+L004maw_sse2_exit:
+ movd %mm1,%eax
+ emms
+ ret
+.align 4,0x90
+L001maw_non_sse2:
pushl %ebp
pushl %ebx
pushl %esi
@@ -18,9 +114,9 @@ L_bn_mul_add_words_begin:
andl $4294967288,%ecx
movl 32(%esp),%ebp
pushl %ecx
- jz L000maw_finish
+ jz L006maw_finish
.align 4,0x90
-L001maw_loop:
+L007maw_loop:
# Round 0
movl (%ebx),%eax
mull %ebp
@@ -97,13 +193,13 @@ L001maw_loop:
subl $8,%ecx
leal 32(%ebx),%ebx
leal 32(%edi),%edi
- jnz L001maw_loop
-L000maw_finish:
+ jnz L007maw_loop
+L006maw_finish:
movl 32(%esp),%ecx
andl $7,%ecx
- jnz L002maw_finish2
- jmp L003maw_end
-L002maw_finish2:
+ jnz L008maw_finish2
+ jmp L009maw_end
+L008maw_finish2:
# Tail Round 0
movl (%ebx),%eax
mull %ebp
@@ -114,7 +210,7 @@ L002maw_finish2:
decl %ecx
movl %eax,(%edi)
movl %edx,%esi
- jz L003maw_end
+ jz L009maw_end
# Tail Round 1
movl 4(%ebx),%eax
mull %ebp
@@ -125,7 +221,7 @@ L002maw_finish2:
decl %ecx
movl %eax,4(%edi)
movl %edx,%esi
- jz L003maw_end
+ jz L009maw_end
# Tail Round 2
movl 8(%ebx),%eax
mull %ebp
@@ -136,7 +232,7 @@ L002maw_finish2:
decl %ecx
movl %eax,8(%edi)
movl %edx,%esi
- jz L003maw_end
+ jz L009maw_end
# Tail Round 3
movl 12(%ebx),%eax
mull %ebp
@@ -147,7 +243,7 @@ L002maw_finish2:
decl %ecx
movl %eax,12(%edi)
movl %edx,%esi
- jz L003maw_end
+ jz L009maw_end
# Tail Round 4
movl 16(%ebx),%eax
mull %ebp
@@ -158,7 +254,7 @@ L002maw_finish2:
decl %ecx
movl %eax,16(%edi)
movl %edx,%esi
- jz L003maw_end
+ jz L009maw_end
# Tail Round 5
movl 20(%ebx),%eax
mull %ebp
@@ -169,7 +265,7 @@ L002maw_finish2:
decl %ecx
movl %eax,20(%edi)
movl %edx,%esi
- jz L003maw_end
+ jz L009maw_end
# Tail Round 6
movl 24(%ebx),%eax
mull %ebp
@@ -179,7 +275,7 @@ L002maw_finish2:
adcl $0,%edx
movl %eax,24(%edi)
movl %edx,%esi
-L003maw_end:
+L009maw_end:
movl %esi,%eax
popl %ecx
popl %edi
@@ -192,6 +288,33 @@ L003maw_end:
.align 4
_bn_mul_words:
L_bn_mul_words_begin:
+ call L010PIC_me_up
+L010PIC_me_up:
+ popl %eax
+ movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L010PIC_me_up(%eax),%eax
+ btl $26,(%eax)
+ jnc L011mw_non_sse2
+ movl 4(%esp),%eax
+ movl 8(%esp),%edx
+ movl 12(%esp),%ecx
+ movd 16(%esp),%mm0
+ pxor %mm1,%mm1
+.align 4,0x90
+L012mw_sse2_loop:
+ movd (%edx),%mm2
+ pmuludq %mm0,%mm2
+ leal 4(%edx),%edx
+ paddq %mm2,%mm1
+ movd %mm1,(%eax)
+ subl $1,%ecx
+ psrlq $32,%mm1
+ leal 4(%eax),%eax
+ jnz L012mw_sse2_loop
+ movd %mm1,%eax
+ emms
+ ret
+.align 4,0x90
+L011mw_non_sse2:
pushl %ebp
pushl %ebx
pushl %esi
@@ -203,8 +326,8 @@ L_bn_mul_words_begin:
movl 28(%esp),%ebp
movl 32(%esp),%ecx
andl $4294967288,%ebp
- jz L004mw_finish
-L005mw_loop:
+ jz L013mw_finish
+L014mw_loop:
# Round 0
movl (%ebx),%eax
mull %ecx
@@ -265,14 +388,14 @@ L005mw_loop:
addl $32,%ebx
addl $32,%edi
subl $8,%ebp
- jz L004mw_finish
- jmp L005mw_loop
-L004mw_finish:
+ jz L013mw_finish
+ jmp L014mw_loop
+L013mw_finish:
movl 28(%esp),%ebp
andl $7,%ebp
- jnz L006mw_finish2
- jmp L007mw_end
-L006mw_finish2:
+ jnz L015mw_finish2
+ jmp L016mw_end
+L015mw_finish2:
# Tail Round 0
movl (%ebx),%eax
mull %ecx
@@ -281,7 +404,7 @@ L006mw_finish2:
movl %eax,(%edi)
movl %edx,%esi
decl %ebp
- jz L007mw_end
+ jz L016mw_end
# Tail Round 1
movl 4(%ebx),%eax
mull %ecx
@@ -290,7 +413,7 @@ L006mw_finish2:
movl %eax,4(%edi)
movl %edx,%esi
decl %ebp
- jz L007mw_end
+ jz L016mw_end
# Tail Round 2
movl 8(%ebx),%eax
mull %ecx
@@ -299,7 +422,7 @@ L006mw_finish2:
movl %eax,8(%edi)
movl %edx,%esi
decl %ebp
- jz L007mw_end
+ jz L016mw_end
# Tail Round 3
movl 12(%ebx),%eax
mull %ecx
@@ -308,7 +431,7 @@ L006mw_finish2:
movl %eax,12(%edi)
movl %edx,%esi
decl %ebp
- jz L007mw_end
+ jz L016mw_end
# Tail Round 4
movl 16(%ebx),%eax
mull %ecx
@@ -317,7 +440,7 @@ L006mw_finish2:
movl %eax,16(%edi)
movl %edx,%esi
decl %ebp
- jz L007mw_end
+ jz L016mw_end
# Tail Round 5
movl 20(%ebx),%eax
mull %ecx
@@ -326,7 +449,7 @@ L006mw_finish2:
movl %eax,20(%edi)
movl %edx,%esi
decl %ebp
- jz L007mw_end
+ jz L016mw_end
# Tail Round 6
movl 24(%ebx),%eax
mull %ecx
@@ -334,7 +457,7 @@ L006mw_finish2:
adcl $0,%edx
movl %eax,24(%edi)
movl %edx,%esi
-L007mw_end:
+L016mw_end:
movl %esi,%eax
popl %edi
popl %esi
@@ -346,6 +469,28 @@ L007mw_end:
.align 4
_bn_sqr_words:
L_bn_sqr_words_begin:
+ call L017PIC_me_up
+L017PIC_me_up:
+ popl %eax
+ movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L017PIC_me_up(%eax),%eax
+ btl $26,(%eax)
+ jnc L018sqr_non_sse2
+ movl 4(%esp),%eax
+ movl 8(%esp),%edx
+ movl 12(%esp),%ecx
+.align 4,0x90
+L019sqr_sse2_loop:
+ movd (%edx),%mm0
+ pmuludq %mm0,%mm0
+ leal 4(%edx),%edx
+ movq %mm0,(%eax)
+ subl $1,%ecx
+ leal 8(%eax),%eax
+ jnz L019sqr_sse2_loop
+ emms
+ ret
+.align 4,0x90
+L018sqr_non_sse2:
pushl %ebp
pushl %ebx
pushl %esi
@@ -355,8 +500,8 @@ L_bn_sqr_words_begin:
movl 24(%esp),%edi
movl 28(%esp),%ebx
andl $4294967288,%ebx
- jz L008sw_finish
-L009sw_loop:
+ jz L020sw_finish
+L021sw_loop:
# Round 0
movl (%edi),%eax
mull %eax
@@ -401,59 +546,59 @@ L009sw_loop:
addl $32,%edi
addl $64,%esi
subl $8,%ebx
- jnz L009sw_loop
-L008sw_finish:
+ jnz L021sw_loop
+L020sw_finish:
movl 28(%esp),%ebx
andl $7,%ebx
- jz L010sw_end
+ jz L022sw_end
# Tail Round 0
movl (%edi),%eax
mull %eax
movl %eax,(%esi)
decl %ebx
movl %edx,4(%esi)
- jz L010sw_end
+ jz L022sw_end
# Tail Round 1
movl 4(%edi),%eax
mull %eax
movl %eax,8(%esi)
decl %ebx
movl %edx,12(%esi)
- jz L010sw_end
+ jz L022sw_end
# Tail Round 2
movl 8(%edi),%eax
mull %eax
movl %eax,16(%esi)
decl %ebx
movl %edx,20(%esi)
- jz L010sw_end
+ jz L022sw_end
# Tail Round 3
movl 12(%edi),%eax
mull %eax
movl %eax,24(%esi)
decl %ebx
movl %edx,28(%esi)
- jz L010sw_end
+ jz L022sw_end
# Tail Round 4
movl 16(%edi),%eax
mull %eax
movl %eax,32(%esi)
decl %ebx
movl %edx,36(%esi)
- jz L010sw_end
+ jz L022sw_end
# Tail Round 5
movl 20(%edi),%eax
mull %eax
movl %eax,40(%esi)
decl %ebx
movl %edx,44(%esi)
- jz L010sw_end
+ jz L022sw_end
# Tail Round 6
movl 24(%edi),%eax
mull %eax
movl %eax,48(%esi)
movl %edx,52(%esi)
-L010sw_end:
+L022sw_end:
popl %edi
popl %esi
popl %ebx
@@ -485,8 +630,8 @@ L_bn_add_words_begin:
movl 32(%esp),%ebp
xorl %eax,%eax
andl $4294967288,%ebp
- jz L011aw_finish
-L012aw_loop:
+ jz L023aw_finish
+L024aw_loop:
# Round 0
movl (%esi),%ecx
movl (%edi),%edx
@@ -564,11 +709,11 @@ L012aw_loop:
addl $32,%edi
addl $32,%ebx
subl $8,%ebp
- jnz L012aw_loop
-L011aw_finish:
+ jnz L024aw_loop
+L023aw_finish:
movl 32(%esp),%ebp
andl $7,%ebp
- jz L013aw_end
+ jz L025aw_end
# Tail Round 0
movl (%esi),%ecx
movl (%edi),%edx
@@ -579,7 +724,7 @@ L011aw_finish:
adcl $0,%eax
decl %ebp
movl %ecx,(%ebx)
- jz L013aw_end
+ jz L025aw_end
# Tail Round 1
movl 4(%esi),%ecx
movl 4(%edi),%edx
@@ -590,7 +735,7 @@ L011aw_finish:
adcl $0,%eax
decl %ebp
movl %ecx,4(%ebx)
- jz L013aw_end
+ jz L025aw_end
# Tail Round 2
movl 8(%esi),%ecx
movl 8(%edi),%edx
@@ -601,7 +746,7 @@ L011aw_finish:
adcl $0,%eax
decl %ebp
movl %ecx,8(%ebx)
- jz L013aw_end
+ jz L025aw_end
# Tail Round 3
movl 12(%esi),%ecx
movl 12(%edi),%edx
@@ -612,7 +757,7 @@ L011aw_finish:
adcl $0,%eax
decl %ebp
movl %ecx,12(%ebx)
- jz L013aw_end
+ jz L025aw_end
# Tail Round 4
movl 16(%esi),%ecx
movl 16(%edi),%edx
@@ -623,7 +768,7 @@ L011aw_finish:
adcl $0,%eax
decl %ebp
movl %ecx,16(%ebx)
- jz L013aw_end
+ jz L025aw_end
# Tail Round 5
movl 20(%esi),%ecx
movl 20(%edi),%edx
@@ -634,7 +779,7 @@ L011aw_finish:
adcl $0,%eax
decl %ebp
movl %ecx,20(%ebx)
- jz L013aw_end
+ jz L025aw_end
# Tail Round 6
movl 24(%esi),%ecx
movl 24(%edi),%edx
@@ -644,7 +789,7 @@ L011aw_finish:
addl %edx,%ecx
adcl $0,%eax
movl %ecx,24(%ebx)
-L013aw_end:
+L025aw_end:
popl %edi
popl %esi
popl %ebx
@@ -666,8 +811,8 @@ L_bn_sub_words_begin:
movl 32(%esp),%ebp
xorl %eax,%eax
andl $4294967288,%ebp
- jz L014aw_finish
-L015aw_loop:
+ jz L026aw_finish
+L027aw_loop:
# Round 0
movl (%esi),%ecx
movl (%edi),%edx
@@ -745,11 +890,11 @@ L015aw_loop:
addl $32,%edi
addl $32,%ebx
subl $8,%ebp
- jnz L015aw_loop
-L014aw_finish:
+ jnz L027aw_loop
+L026aw_finish:
movl 32(%esp),%ebp
andl $7,%ebp
- jz L016aw_end
+ jz L028aw_end
# Tail Round 0
movl (%esi),%ecx
movl (%edi),%edx
@@ -760,7 +905,7 @@ L014aw_finish:
adcl $0,%eax
decl %ebp
movl %ecx,(%ebx)
- jz L016aw_end
+ jz L028aw_end
# Tail Round 1
movl 4(%esi),%ecx
movl 4(%edi),%edx
@@ -771,7 +916,7 @@ L014aw_finish:
adcl $0,%eax
decl %ebp
movl %ecx,4(%ebx)
- jz L016aw_end
+ jz L028aw_end
# Tail Round 2
movl 8(%esi),%ecx
movl 8(%edi),%edx
@@ -782,7 +927,7 @@ L014aw_finish:
adcl $0,%eax
decl %ebp
movl %ecx,8(%ebx)
- jz L016aw_end
+ jz L028aw_end
# Tail Round 3
movl 12(%esi),%ecx
movl 12(%edi),%edx
@@ -793,7 +938,7 @@ L014aw_finish:
adcl $0,%eax
decl %ebp
movl %ecx,12(%ebx)
- jz L016aw_end
+ jz L028aw_end
# Tail Round 4
movl 16(%esi),%ecx
movl 16(%edi),%edx
@@ -804,7 +949,7 @@ L014aw_finish:
adcl $0,%eax
decl %ebp
movl %ecx,16(%ebx)
- jz L016aw_end
+ jz L028aw_end
# Tail Round 5
movl 20(%esi),%ecx
movl 20(%edi),%edx
@@ -815,7 +960,7 @@ L014aw_finish:
adcl $0,%eax
decl %ebp
movl %ecx,20(%ebx)
- jz L016aw_end
+ jz L028aw_end
# Tail Round 6
movl 24(%esi),%ecx
movl 24(%edi),%edx
@@ -825,7 +970,7 @@ L014aw_finish:
subl %edx,%ecx
adcl $0,%eax
movl %ecx,24(%ebx)
-L016aw_end:
+L028aw_end:
popl %edi
popl %esi
popl %ebx
@@ -847,8 +992,8 @@ L_bn_sub_part_words_begin:
movl 32(%esp),%ebp
xorl %eax,%eax
andl $4294967288,%ebp
- jz L017aw_finish
-L018aw_loop:
+ jz L029aw_finish
+L030aw_loop:
# Round 0
movl (%esi),%ecx
movl (%edi),%edx
@@ -926,11 +1071,11 @@ L018aw_loop:
addl $32,%edi
addl $32,%ebx
subl $8,%ebp
- jnz L018aw_loop
-L017aw_finish:
+ jnz L030aw_loop
+L029aw_finish:
movl 32(%esp),%ebp
andl $7,%ebp
- jz L019aw_end
+ jz L031aw_end
# Tail Round 0
movl (%esi),%ecx
movl (%edi),%edx
@@ -944,7 +1089,7 @@ L017aw_finish:
addl $4,%edi
addl $4,%ebx
decl %ebp
- jz L019aw_end
+ jz L031aw_end
# Tail Round 1
movl (%esi),%ecx
movl (%edi),%edx
@@ -958,7 +1103,7 @@ L017aw_finish:
addl $4,%edi
addl $4,%ebx
decl %ebp
- jz L019aw_end
+ jz L031aw_end
# Tail Round 2
movl (%esi),%ecx
movl (%edi),%edx
@@ -972,7 +1117,7 @@ L017aw_finish:
addl $4,%edi
addl $4,%ebx
decl %ebp
- jz L019aw_end
+ jz L031aw_end
# Tail Round 3
movl (%esi),%ecx
movl (%edi),%edx
@@ -986,7 +1131,7 @@ L017aw_finish:
addl $4,%edi
addl $4,%ebx
decl %ebp
- jz L019aw_end
+ jz L031aw_end
# Tail Round 4
movl (%esi),%ecx
movl (%edi),%edx
@@ -1000,7 +1145,7 @@ L017aw_finish:
addl $4,%edi
addl $4,%ebx
decl %ebp
- jz L019aw_end
+ jz L031aw_end
# Tail Round 5
movl (%esi),%ecx
movl (%edi),%edx
@@ -1014,7 +1159,7 @@ L017aw_finish:
addl $4,%edi
addl $4,%ebx
decl %ebp
- jz L019aw_end
+ jz L031aw_end
# Tail Round 6
movl (%esi),%ecx
movl (%edi),%edx
@@ -1027,20 +1172,20 @@ L017aw_finish:
addl $4,%esi
addl $4,%edi
addl $4,%ebx
-L019aw_end:
+L031aw_end:
cmpl $0,36(%esp)
- je L020pw_end
+ je L032pw_end
movl 36(%esp),%ebp
cmpl $0,%ebp
- je L020pw_end
- jge L021pw_pos
+ je L032pw_end
+ jge L033pw_pos
# pw_neg
movl $0,%edx
subl %ebp,%edx
movl %edx,%ebp
andl $4294967288,%ebp
- jz L022pw_neg_finish
-L023pw_neg_loop:
+ jz L034pw_neg_finish
+L035pw_neg_loop:
# dl<0 Round 0
movl $0,%ecx
movl (%edi),%edx
@@ -1117,13 +1262,13 @@ L023pw_neg_loop:
addl $32,%edi
addl $32,%ebx
subl $8,%ebp
- jnz L023pw_neg_loop
-L022pw_neg_finish:
+ jnz L035pw_neg_loop
+L034pw_neg_finish:
movl 36(%esp),%edx
movl $0,%ebp
subl %edx,%ebp
andl $7,%ebp
- jz L020pw_end
+ jz L032pw_end
# dl<0 Tail Round 0
movl $0,%ecx
movl (%edi),%edx
@@ -1134,7 +1279,7 @@ L022pw_neg_finish:
adcl $0,%eax
decl %ebp
movl %ecx,(%ebx)
- jz L020pw_end
+ jz L032pw_end
# dl<0 Tail Round 1
movl $0,%ecx
movl 4(%edi),%edx
@@ -1145,7 +1290,7 @@ L022pw_neg_finish:
adcl $0,%eax
decl %ebp
movl %ecx,4(%ebx)
- jz L020pw_end
+ jz L032pw_end
# dl<0 Tail Round 2
movl $0,%ecx
movl 8(%edi),%edx
@@ -1156,7 +1301,7 @@ L022pw_neg_finish:
adcl $0,%eax
decl %ebp
movl %ecx,8(%ebx)
- jz L020pw_end
+ jz L032pw_end
# dl<0 Tail Round 3
movl $0,%ecx
movl 12(%edi),%edx
@@ -1167,7 +1312,7 @@ L022pw_neg_finish:
adcl $0,%eax
decl %ebp
movl %ecx,12(%ebx)
- jz L020pw_end
+ jz L032pw_end
# dl<0 Tail Round 4
movl $0,%ecx
movl 16(%edi),%edx
@@ -1178,7 +1323,7 @@ L022pw_neg_finish:
adcl $0,%eax
decl %ebp
movl %ecx,16(%ebx)
- jz L020pw_end
+ jz L032pw_end
# dl<0 Tail Round 5
movl $0,%ecx
movl 20(%edi),%edx
@@ -1189,7 +1334,7 @@ L022pw_neg_finish:
adcl $0,%eax
decl %ebp
movl %ecx,20(%ebx)
- jz L020pw_end
+ jz L032pw_end
# dl<0 Tail Round 6
movl $0,%ecx
movl 24(%edi),%edx
@@ -1199,181 +1344,185 @@ L022pw_neg_finish:
subl %edx,%ecx
adcl $0,%eax
movl %ecx,24(%ebx)
- jmp L020pw_end
-L021pw_pos:
+ jmp L032pw_end
+L033pw_pos:
andl $4294967288,%ebp
- jz L024pw_pos_finish
-L025pw_pos_loop:
+ jz L036pw_pos_finish
+L037pw_pos_loop:
# dl>0 Round 0
movl (%esi),%ecx
subl %eax,%ecx
movl %ecx,(%ebx)
- jnc L026pw_nc0
+ jnc L038pw_nc0
# dl>0 Round 1
movl 4(%esi),%ecx
subl %eax,%ecx
movl %ecx,4(%ebx)
- jnc L027pw_nc1
+ jnc L039pw_nc1
# dl>0 Round 2
movl 8(%esi),%ecx
subl %eax,%ecx
movl %ecx,8(%ebx)
- jnc L028pw_nc2
+ jnc L040pw_nc2
# dl>0 Round 3
movl 12(%esi),%ecx
subl %eax,%ecx
movl %ecx,12(%ebx)
- jnc L029pw_nc3
+ jnc L041pw_nc3
# dl>0 Round 4
movl 16(%esi),%ecx
subl %eax,%ecx
movl %ecx,16(%ebx)
- jnc L030pw_nc4
+ jnc L042pw_nc4
# dl>0 Round 5
movl 20(%esi),%ecx
subl %eax,%ecx
movl %ecx,20(%ebx)
- jnc L031pw_nc5
+ jnc L043pw_nc5
# dl>0 Round 6
movl 24(%esi),%ecx
subl %eax,%ecx
movl %ecx,24(%ebx)
- jnc L032pw_nc6
+ jnc L044pw_nc6
# dl>0 Round 7
movl 28(%esi),%ecx
subl %eax,%ecx
movl %ecx,28(%ebx)
- jnc L033pw_nc7
+ jnc L045pw_nc7
addl $32,%esi
addl $32,%ebx
subl $8,%ebp
- jnz L025pw_pos_loop
-L024pw_pos_finish:
+ jnz L037pw_pos_loop
+L036pw_pos_finish:
movl 36(%esp),%ebp
andl $7,%ebp
- jz L020pw_end
+ jz L032pw_end
# dl>0 Tail Round 0
movl (%esi),%ecx
subl %eax,%ecx
movl %ecx,(%ebx)
- jnc L034pw_tail_nc0
+ jnc L046pw_tail_nc0
decl %ebp
- jz L020pw_end
+ jz L032pw_end
# dl>0 Tail Round 1
movl 4(%esi),%ecx
subl %eax,%ecx
movl %ecx,4(%ebx)
- jnc L035pw_tail_nc1
+ jnc L047pw_tail_nc1
decl %ebp
- jz L020pw_end
+ jz L032pw_end
# dl>0 Tail Round 2
movl 8(%esi),%ecx
subl %eax,%ecx
movl %ecx,8(%ebx)
- jnc L036pw_tail_nc2
+ jnc L048pw_tail_nc2
decl %ebp
- jz L020pw_end
+ jz L032pw_end
# dl>0 Tail Round 3
movl 12(%esi),%ecx
subl %eax,%ecx
movl %ecx,12(%ebx)
- jnc L037pw_tail_nc3
+ jnc L049pw_tail_nc3
decl %ebp
- jz L020pw_end
+ jz L032pw_end
# dl>0 Tail Round 4
movl 16(%esi),%ecx
subl %eax,%ecx
movl %ecx,16(%ebx)
- jnc L038pw_tail_nc4
+ jnc L050pw_tail_nc4
decl %ebp
- jz L020pw_end
+ jz L032pw_end
# dl>0 Tail Round 5
movl 20(%esi),%ecx
subl %eax,%ecx
movl %ecx,20(%ebx)
- jnc L039pw_tail_nc5
+ jnc L051pw_tail_nc5
decl %ebp
- jz L020pw_end
+ jz L032pw_end
# dl>0 Tail Round 6
movl 24(%esi),%ecx
subl %eax,%ecx
movl %ecx,24(%ebx)
- jnc L040pw_tail_nc6
+ jnc L052pw_tail_nc6
movl $1,%eax
- jmp L020pw_end
-L041pw_nc_loop:
+ jmp L032pw_end
+L053pw_nc_loop:
movl (%esi),%ecx
movl %ecx,(%ebx)
-L026pw_nc0:
+L038pw_nc0:
movl 4(%esi),%ecx
movl %ecx,4(%ebx)
-L027pw_nc1:
+L039pw_nc1:
movl 8(%esi),%ecx
movl %ecx,8(%ebx)
-L028pw_nc2:
+L040pw_nc2:
movl 12(%esi),%ecx
movl %ecx,12(%ebx)
-L029pw_nc3:
+L041pw_nc3:
movl 16(%esi),%ecx
movl %ecx,16(%ebx)
-L030pw_nc4:
+L042pw_nc4:
movl 20(%esi),%ecx
movl %ecx,20(%ebx)
-L031pw_nc5:
+L043pw_nc5:
movl 24(%esi),%ecx
movl %ecx,24(%ebx)
-L032pw_nc6:
+L044pw_nc6:
movl 28(%esi),%ecx
movl %ecx,28(%ebx)
-L033pw_nc7:
+L045pw_nc7:
addl $32,%esi
addl $32,%ebx
subl $8,%ebp
- jnz L041pw_nc_loop
+ jnz L053pw_nc_loop
movl 36(%esp),%ebp
andl $7,%ebp
- jz L042pw_nc_end
+ jz L054pw_nc_end
movl (%esi),%ecx
movl %ecx,(%ebx)
-L034pw_tail_nc0:
+L046pw_tail_nc0:
decl %ebp
- jz L042pw_nc_end
+ jz L054pw_nc_end
movl 4(%esi),%ecx
movl %ecx,4(%ebx)
-L035pw_tail_nc1:
+L047pw_tail_nc1:
decl %ebp
- jz L042pw_nc_end
+ jz L054pw_nc_end
movl 8(%esi),%ecx
movl %ecx,8(%ebx)
-L036pw_tail_nc2:
+L048pw_tail_nc2:
decl %ebp
- jz L042pw_nc_end
+ jz L054pw_nc_end
movl 12(%esi),%ecx
movl %ecx,12(%ebx)
-L037pw_tail_nc3:
+L049pw_tail_nc3:
decl %ebp
- jz L042pw_nc_end
+ jz L054pw_nc_end
movl 16(%esi),%ecx
movl %ecx,16(%ebx)
-L038pw_tail_nc4:
+L050pw_tail_nc4:
decl %ebp
- jz L042pw_nc_end
+ jz L054pw_nc_end
movl 20(%esi),%ecx
movl %ecx,20(%ebx)
-L039pw_tail_nc5:
+L051pw_tail_nc5:
decl %ebp
- jz L042pw_nc_end
+ jz L054pw_nc_end
movl 24(%esi),%ecx
movl %ecx,24(%ebx)
-L040pw_tail_nc6:
-L042pw_nc_end:
+L052pw_tail_nc6:
+L054pw_nc_end:
movl $0,%eax
-L020pw_end:
+L032pw_end:
popl %edi
popl %esi
popl %ebx
popl %ebp
ret
+.section __IMPORT,__pointers,non_lazy_symbol_pointers
+L_OPENSSL_ia32cap_P$non_lazy_ptr:
+.indirect_symbol _OPENSSL_ia32cap_P
+.long 0
#endif
diff --git a/mac-x86/crypto/bn/x86-mont.S b/mac-x86/crypto/bn/x86-mont.S
index 1b79c5f..234034b 100644
--- a/mac-x86/crypto/bn/x86-mont.S
+++ b/mac-x86/crypto/bn/x86-mont.S
@@ -43,6 +43,126 @@ L_bn_mul_mont_begin:
movl %esi,20(%esp)
leal -3(%edi),%ebx
movl %ebp,24(%esp)
+ call L001PIC_me_up
+L001PIC_me_up:
+ popl %eax
+ movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L001PIC_me_up(%eax),%eax
+ btl $26,(%eax)
+ jnc L002non_sse2
+ movl $-1,%eax
+ movd %eax,%mm7
+ movl 8(%esp),%esi
+ movl 12(%esp),%edi
+ movl 16(%esp),%ebp
+ xorl %edx,%edx
+ xorl %ecx,%ecx
+ movd (%edi),%mm4
+ movd (%esi),%mm5
+ movd (%ebp),%mm3
+ pmuludq %mm4,%mm5
+ movq %mm5,%mm2
+ movq %mm5,%mm0
+ pand %mm7,%mm0
+ pmuludq 20(%esp),%mm5
+ pmuludq %mm5,%mm3
+ paddq %mm0,%mm3
+ movd 4(%ebp),%mm1
+ movd 4(%esi),%mm0
+ psrlq $32,%mm2
+ psrlq $32,%mm3
+ incl %ecx
+.align 4,0x90
+L0031st:
+ pmuludq %mm4,%mm0
+ pmuludq %mm5,%mm1
+ paddq %mm0,%mm2
+ paddq %mm1,%mm3
+ movq %mm2,%mm0
+ pand %mm7,%mm0
+ movd 4(%ebp,%ecx,4),%mm1
+ paddq %mm0,%mm3
+ movd 4(%esi,%ecx,4),%mm0
+ psrlq $32,%mm2
+ movd %mm3,28(%esp,%ecx,4)
+ psrlq $32,%mm3
+ leal 1(%ecx),%ecx
+ cmpl %ebx,%ecx
+ jl L0031st
+ pmuludq %mm4,%mm0
+ pmuludq %mm5,%mm1
+ paddq %mm0,%mm2
+ paddq %mm1,%mm3
+ movq %mm2,%mm0
+ pand %mm7,%mm0
+ paddq %mm0,%mm3
+ movd %mm3,28(%esp,%ecx,4)
+ psrlq $32,%mm2
+ psrlq $32,%mm3
+ paddq %mm2,%mm3
+ movq %mm3,32(%esp,%ebx,4)
+ incl %edx
+L004outer:
+ xorl %ecx,%ecx
+ movd (%edi,%edx,4),%mm4
+ movd (%esi),%mm5
+ movd 32(%esp),%mm6
+ movd (%ebp),%mm3
+ pmuludq %mm4,%mm5
+ paddq %mm6,%mm5
+ movq %mm5,%mm0
+ movq %mm5,%mm2
+ pand %mm7,%mm0
+ pmuludq 20(%esp),%mm5
+ pmuludq %mm5,%mm3
+ paddq %mm0,%mm3
+ movd 36(%esp),%mm6
+ movd 4(%ebp),%mm1
+ movd 4(%esi),%mm0
+ psrlq $32,%mm2
+ psrlq $32,%mm3
+ paddq %mm6,%mm2
+ incl %ecx
+ decl %ebx
+L005inner:
+ pmuludq %mm4,%mm0
+ pmuludq %mm5,%mm1
+ paddq %mm0,%mm2
+ paddq %mm1,%mm3
+ movq %mm2,%mm0
+ movd 36(%esp,%ecx,4),%mm6
+ pand %mm7,%mm0
+ movd 4(%ebp,%ecx,4),%mm1
+ paddq %mm0,%mm3
+ movd 4(%esi,%ecx,4),%mm0
+ psrlq $32,%mm2
+ movd %mm3,28(%esp,%ecx,4)
+ psrlq $32,%mm3
+ paddq %mm6,%mm2
+ decl %ebx
+ leal 1(%ecx),%ecx
+ jnz L005inner
+ movl %ecx,%ebx
+ pmuludq %mm4,%mm0
+ pmuludq %mm5,%mm1
+ paddq %mm0,%mm2
+ paddq %mm1,%mm3
+ movq %mm2,%mm0
+ pand %mm7,%mm0
+ paddq %mm0,%mm3
+ movd %mm3,28(%esp,%ecx,4)
+ psrlq $32,%mm2
+ psrlq $32,%mm3
+ movd 36(%esp,%ebx,4),%mm6
+ paddq %mm2,%mm3
+ paddq %mm6,%mm3
+ movq %mm3,32(%esp,%ebx,4)
+ leal 1(%edx),%edx
+ cmpl %ebx,%edx
+ jle L004outer
+ emms
+ jmp L006common_tail
+.align 4,0x90
+L002non_sse2:
movl 8(%esp),%esi
leal 1(%ebx),%ebp
movl 12(%esp),%edi
@@ -53,12 +173,12 @@ L_bn_mul_mont_begin:
leal 4(%edi,%ebx,4),%eax
orl %edx,%ebp
movl (%edi),%edi
- jz L001bn_sqr_mont
+ jz L007bn_sqr_mont
movl %eax,28(%esp)
movl (%esi),%eax
xorl %edx,%edx
.align 4,0x90
-L002mull:
+L008mull:
movl %edx,%ebp
mull %edi
addl %eax,%ebp
@@ -67,7 +187,7 @@ L002mull:
movl (%esi,%ecx,4),%eax
cmpl %ebx,%ecx
movl %ebp,28(%esp,%ecx,4)
- jl L002mull
+ jl L008mull
movl %edx,%ebp
mull %edi
movl 20(%esp),%edi
@@ -85,9 +205,9 @@ L002mull:
movl 4(%esi),%eax
adcl $0,%edx
incl %ecx
- jmp L0032ndmadd
+ jmp L0092ndmadd
.align 4,0x90
-L0041stmadd:
+L0101stmadd:
movl %edx,%ebp
mull %edi
addl 32(%esp,%ecx,4),%ebp
@@ -98,7 +218,7 @@ L0041stmadd:
adcl $0,%edx
cmpl %ebx,%ecx
movl %ebp,28(%esp,%ecx,4)
- jl L0041stmadd
+ jl L0101stmadd
movl %edx,%ebp
mull %edi
addl 32(%esp,%ebx,4),%eax
@@ -121,7 +241,7 @@ L0041stmadd:
adcl $0,%edx
movl $1,%ecx
.align 4,0x90
-L0032ndmadd:
+L0092ndmadd:
movl %edx,%ebp
mull %edi
addl 32(%esp,%ecx,4),%ebp
@@ -132,7 +252,7 @@ L0032ndmadd:
adcl $0,%edx
cmpl %ebx,%ecx
movl %ebp,24(%esp,%ecx,4)
- jl L0032ndmadd
+ jl L0092ndmadd
movl %edx,%ebp
mull %edi
addl 32(%esp,%ebx,4),%ebp
@@ -148,16 +268,16 @@ L0032ndmadd:
movl %edx,32(%esp,%ebx,4)
cmpl 28(%esp),%ecx
movl %eax,36(%esp,%ebx,4)
- je L005common_tail
+ je L006common_tail
movl (%ecx),%edi
movl 8(%esp),%esi
movl %ecx,12(%esp)
xorl %ecx,%ecx
xorl %edx,%edx
movl (%esi),%eax
- jmp L0041stmadd
+ jmp L0101stmadd
.align 4,0x90
-L001bn_sqr_mont:
+L007bn_sqr_mont:
movl %ebx,(%esp)
movl %ecx,12(%esp)
movl %edi,%eax
@@ -168,7 +288,7 @@ L001bn_sqr_mont:
andl $1,%ebx
incl %ecx
.align 4,0x90
-L006sqr:
+L011sqr:
movl (%esi,%ecx,4),%eax
movl %edx,%ebp
mull %edi
@@ -180,7 +300,7 @@ L006sqr:
cmpl (%esp),%ecx
movl %eax,%ebx
movl %ebp,28(%esp,%ecx,4)
- jl L006sqr
+ jl L011sqr
movl (%esi,%ecx,4),%eax
movl %edx,%ebp
mull %edi
@@ -204,7 +324,7 @@ L006sqr:
movl 4(%esi),%eax
movl $1,%ecx
.align 4,0x90
-L0073rdmadd:
+L0123rdmadd:
movl %edx,%ebp
mull %edi
addl 32(%esp,%ecx,4),%ebp
@@ -223,7 +343,7 @@ L0073rdmadd:
adcl $0,%edx
cmpl %ebx,%ecx
movl %ebp,24(%esp,%ecx,4)
- jl L0073rdmadd
+ jl L0123rdmadd
movl %edx,%ebp
mull %edi
addl 32(%esp,%ebx,4),%ebp
@@ -239,7 +359,7 @@ L0073rdmadd:
movl %edx,32(%esp,%ebx,4)
cmpl %ebx,%ecx
movl %eax,36(%esp,%ebx,4)
- je L005common_tail
+ je L006common_tail
movl 4(%esi,%ecx,4),%edi
leal 1(%ecx),%ecx
movl %edi,%eax
@@ -251,12 +371,12 @@ L0073rdmadd:
xorl %ebp,%ebp
cmpl %ebx,%ecx
leal 1(%ecx),%ecx
- je L008sqrlast
+ je L013sqrlast
movl %edx,%ebx
shrl $1,%edx
andl $1,%ebx
.align 4,0x90
-L009sqradd:
+L014sqradd:
movl (%esi,%ecx,4),%eax
movl %edx,%ebp
mull %edi
@@ -272,13 +392,13 @@ L009sqradd:
cmpl (%esp),%ecx
movl %ebp,28(%esp,%ecx,4)
movl %eax,%ebx
- jle L009sqradd
+ jle L014sqradd
movl %edx,%ebp
addl %edx,%edx
shrl $31,%ebp
addl %ebx,%edx
adcl $0,%ebp
-L008sqrlast:
+L013sqrlast:
movl 20(%esp),%edi
movl 16(%esp),%esi
imull 32(%esp),%edi
@@ -293,9 +413,9 @@ L008sqrlast:
adcl $0,%edx
movl $1,%ecx
movl 4(%esi),%eax
- jmp L0073rdmadd
+ jmp L0123rdmadd
.align 4,0x90
-L005common_tail:
+L006common_tail:
movl 16(%esp),%ebp
movl 4(%esp),%edi
leal 32(%esp),%esi
@@ -303,16 +423,16 @@ L005common_tail:
movl %ebx,%ecx
xorl %edx,%edx
.align 4,0x90
-L010sub:
+L015sub:
sbbl (%ebp,%edx,4),%eax
movl %eax,(%edi,%edx,4)
decl %ecx
movl 4(%esi,%edx,4),%eax
leal 1(%edx),%edx
- jge L010sub
+ jge L015sub
sbbl $0,%eax
.align 4,0x90
-L011copy:
+L016copy:
movl (%esi,%ebx,4),%edx
movl (%edi,%ebx,4),%ebp
xorl %ebp,%edx
@@ -321,7 +441,7 @@ L011copy:
movl %ecx,(%esi,%ebx,4)
movl %edx,(%edi,%ebx,4)
decl %ebx
- jge L011copy
+ jge L016copy
movl 24(%esp),%esp
movl $1,%eax
L000just_leave:
@@ -335,4 +455,8 @@ L000just_leave:
.byte 54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
.byte 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
.byte 111,114,103,62,0
+.section __IMPORT,__pointers,non_lazy_symbol_pointers
+L_OPENSSL_ia32cap_P$non_lazy_ptr:
+.indirect_symbol _OPENSSL_ia32cap_P
+.long 0
#endif
diff --git a/mac-x86/crypto/cpu-x86-asm.S b/mac-x86/crypto/cpu-x86-asm.S
index 7e8c83a..bfb292c 100644
--- a/mac-x86/crypto/cpu-x86-asm.S
+++ b/mac-x86/crypto/cpu-x86-asm.S
@@ -100,10 +100,6 @@ L004nocacheinfo:
cmpl $0,%ebp
jne L005notintel
orl $1073741824,%edx
- andb $15,%ah
- cmpb $15,%ah
- jne L005notintel
- orl $1048576,%edx
L005notintel:
btl $28,%edx
jnc L002generic
@@ -232,6 +228,18 @@ L015PIC_me_up:
movl (%ecx),%ecx
btl $1,(%ecx)
jnc L016no_x87
+ andl $83886080,%ecx
+ cmpl $83886080,%ecx
+ jne L017no_sse2
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+L017no_sse2:
.long 4007259865,4007259865,4007259865,4007259865,2430851995
L016no_x87:
leal 4(%esp),%eax
@@ -246,11 +254,11 @@ L_OPENSSL_atomic_add_begin:
pushl %ebx
nop
movl (%edx),%eax
-L017spin:
+L018spin:
leal (%eax,%ecx,1),%ebx
nop
.long 447811568
- jne L017spin
+ jne L018spin
movl %ebx,%eax
popl %ebx
ret
@@ -286,11 +294,11 @@ L_OPENSSL_indirect_call_begin:
_OPENSSL_ia32_rdrand:
L_OPENSSL_ia32_rdrand_begin:
movl $8,%ecx
-L018loop:
+L019loop:
.byte 15,199,240
- jc L019break
- loop L018loop
-L019break:
+ jc L020break
+ loop L019loop
+L020break:
cmpl $0,%eax
cmovel %ecx,%eax
ret
diff --git a/mac-x86/crypto/sha/sha1-586.S b/mac-x86/crypto/sha/sha1-586.S
index 4dd3fee..97aafbf 100644
--- a/mac-x86/crypto/sha/sha1-586.S
+++ b/mac-x86/crypto/sha/sha1-586.S
@@ -10,6 +10,23 @@ L_sha1_block_data_order_begin:
pushl %ebx
pushl %esi
pushl %edi
+ call L000pic_point
+L000pic_point:
+ popl %ebp
+ movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L000pic_point(%ebp),%esi
+ leal LK_XX_XX-L000pic_point(%ebp),%ebp
+ movl (%esi),%eax
+ movl 4(%esi),%edx
+ testl $512,%edx
+ jz L001x86
+ movl 8(%esi),%ecx
+ testl $16777216,%eax
+ jz L001x86
+ testl $536870912,%ecx
+ jnz Lshaext_shortcut
+ jmp Lssse3_shortcut
+.align 4,0x90
+L001x86:
movl 20(%esp),%ebp
movl 24(%esp),%esi
movl 28(%esp),%eax
@@ -18,9 +35,9 @@ L_sha1_block_data_order_begin:
addl %esi,%eax
movl %eax,104(%esp)
movl 16(%ebp),%edi
- jmp L000loop
+ jmp L002loop
.align 4,0x90
-L000loop:
+L002loop:
movl (%esi),%eax
movl 4(%esi),%ebx
movl 8(%esi),%ecx
@@ -1367,15 +1384,1414 @@ L000loop:
movl %ebx,12(%ebp)
movl %edx,%esi
movl %ecx,16(%ebp)
- jb L000loop
+ jb L002loop
addl $76,%esp
popl %edi
popl %esi
popl %ebx
popl %ebp
ret
+.private_extern __sha1_block_data_order_shaext
+.align 4
+__sha1_block_data_order_shaext:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ call L003pic_point
+L003pic_point:
+ popl %ebp
+ leal LK_XX_XX-L003pic_point(%ebp),%ebp
+Lshaext_shortcut:
+ movl 20(%esp),%edi
+ movl %esp,%ebx
+ movl 24(%esp),%esi
+ movl 28(%esp),%ecx
+ subl $32,%esp
+ movdqu (%edi),%xmm0
+ movd 16(%edi),%xmm1
+ andl $-32,%esp
+ movdqa 80(%ebp),%xmm3
+ movdqu (%esi),%xmm4
+ pshufd $27,%xmm0,%xmm0
+ movdqu 16(%esi),%xmm5
+ pshufd $27,%xmm1,%xmm1
+ movdqu 32(%esi),%xmm6
+.byte 102,15,56,0,227
+ movdqu 48(%esi),%xmm7
+.byte 102,15,56,0,235
+.byte 102,15,56,0,243
+.byte 102,15,56,0,251
+ jmp L004loop_shaext
+.align 4,0x90
+L004loop_shaext:
+ decl %ecx
+ leal 64(%esi),%eax
+ movdqa %xmm1,(%esp)
+ paddd %xmm4,%xmm1
+ cmovnel %eax,%esi
+ movdqa %xmm0,16(%esp)
+.byte 15,56,201,229
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,0
+.byte 15,56,200,213
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238
+.byte 15,56,202,231
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,0
+.byte 15,56,200,206
+ pxor %xmm7,%xmm5
+.byte 15,56,202,236
+.byte 15,56,201,247
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,0
+.byte 15,56,200,215
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252
+.byte 15,56,202,245
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,0
+.byte 15,56,200,204
+ pxor %xmm5,%xmm7
+.byte 15,56,202,254
+.byte 15,56,201,229
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,0
+.byte 15,56,200,213
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238
+.byte 15,56,202,231
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,1
+.byte 15,56,200,206
+ pxor %xmm7,%xmm5
+.byte 15,56,202,236
+.byte 15,56,201,247
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,1
+.byte 15,56,200,215
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252
+.byte 15,56,202,245
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,1
+.byte 15,56,200,204
+ pxor %xmm5,%xmm7
+.byte 15,56,202,254
+.byte 15,56,201,229
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,1
+.byte 15,56,200,213
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238
+.byte 15,56,202,231
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,1
+.byte 15,56,200,206
+ pxor %xmm7,%xmm5
+.byte 15,56,202,236
+.byte 15,56,201,247
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,2
+.byte 15,56,200,215
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252
+.byte 15,56,202,245
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,2
+.byte 15,56,200,204
+ pxor %xmm5,%xmm7
+.byte 15,56,202,254
+.byte 15,56,201,229
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,2
+.byte 15,56,200,213
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238
+.byte 15,56,202,231
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,2
+.byte 15,56,200,206
+ pxor %xmm7,%xmm5
+.byte 15,56,202,236
+.byte 15,56,201,247
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,2
+.byte 15,56,200,215
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252
+.byte 15,56,202,245
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,3
+.byte 15,56,200,204
+ pxor %xmm5,%xmm7
+.byte 15,56,202,254
+ movdqu (%esi),%xmm4
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,3
+.byte 15,56,200,213
+ movdqu 16(%esi),%xmm5
+.byte 102,15,56,0,227
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,3
+.byte 15,56,200,206
+ movdqu 32(%esi),%xmm6
+.byte 102,15,56,0,235
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,3
+.byte 15,56,200,215
+ movdqu 48(%esi),%xmm7
+.byte 102,15,56,0,243
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,3
+ movdqa (%esp),%xmm2
+.byte 102,15,56,0,251
+.byte 15,56,200,202
+ paddd 16(%esp),%xmm0
+ jnz L004loop_shaext
+ pshufd $27,%xmm0,%xmm0
+ pshufd $27,%xmm1,%xmm1
+ movdqu %xmm0,(%edi)
+ movd %xmm1,16(%edi)
+ movl %ebx,%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.private_extern __sha1_block_data_order_ssse3
+.align 4
+__sha1_block_data_order_ssse3:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ call L005pic_point
+L005pic_point:
+ popl %ebp
+ leal LK_XX_XX-L005pic_point(%ebp),%ebp
+Lssse3_shortcut:
+ movdqa (%ebp),%xmm7
+ movdqa 16(%ebp),%xmm0
+ movdqa 32(%ebp),%xmm1
+ movdqa 48(%ebp),%xmm2
+ movdqa 64(%ebp),%xmm6
+ movl 20(%esp),%edi
+ movl 24(%esp),%ebp
+ movl 28(%esp),%edx
+ movl %esp,%esi
+ subl $208,%esp
+ andl $-64,%esp
+ movdqa %xmm0,112(%esp)
+ movdqa %xmm1,128(%esp)
+ movdqa %xmm2,144(%esp)
+ shll $6,%edx
+ movdqa %xmm7,160(%esp)
+ addl %ebp,%edx
+ movdqa %xmm6,176(%esp)
+ addl $64,%ebp
+ movl %edi,192(%esp)
+ movl %ebp,196(%esp)
+ movl %edx,200(%esp)
+ movl %esi,204(%esp)
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ movl 16(%edi),%edi
+ movl %ebx,%esi
+ movdqu -64(%ebp),%xmm0
+ movdqu -48(%ebp),%xmm1
+ movdqu -32(%ebp),%xmm2
+ movdqu -16(%ebp),%xmm3
+.byte 102,15,56,0,198
+.byte 102,15,56,0,206
+.byte 102,15,56,0,214
+ movdqa %xmm7,96(%esp)
+.byte 102,15,56,0,222
+ paddd %xmm7,%xmm0
+ paddd %xmm7,%xmm1
+ paddd %xmm7,%xmm2
+ movdqa %xmm0,(%esp)
+ psubd %xmm7,%xmm0
+ movdqa %xmm1,16(%esp)
+ psubd %xmm7,%xmm1
+ movdqa %xmm2,32(%esp)
+ movl %ecx,%ebp
+ psubd %xmm7,%xmm2
+ xorl %edx,%ebp
+ pshufd $238,%xmm0,%xmm4
+ andl %ebp,%esi
+ jmp L006loop
+.align 4,0x90
+L006loop:
+ rorl $2,%ebx
+ xorl %edx,%esi
+ movl %eax,%ebp
+ punpcklqdq %xmm1,%xmm4
+ movdqa %xmm3,%xmm6
+ addl (%esp),%edi
+ xorl %ecx,%ebx
+ paddd %xmm3,%xmm7
+ movdqa %xmm0,64(%esp)
+ roll $5,%eax
+ addl %esi,%edi
+ psrldq $4,%xmm6
+ andl %ebx,%ebp
+ xorl %ecx,%ebx
+ pxor %xmm0,%xmm4
+ addl %eax,%edi
+ rorl $7,%eax
+ pxor %xmm2,%xmm6
+ xorl %ecx,%ebp
+ movl %edi,%esi
+ addl 4(%esp),%edx
+ pxor %xmm6,%xmm4
+ xorl %ebx,%eax
+ roll $5,%edi
+ movdqa %xmm7,48(%esp)
+ addl %ebp,%edx
+ andl %eax,%esi
+ movdqa %xmm4,%xmm0
+ xorl %ebx,%eax
+ addl %edi,%edx
+ rorl $7,%edi
+ movdqa %xmm4,%xmm6
+ xorl %ebx,%esi
+ pslldq $12,%xmm0
+ paddd %xmm4,%xmm4
+ movl %edx,%ebp
+ addl 8(%esp),%ecx
+ psrld $31,%xmm6
+ xorl %eax,%edi
+ roll $5,%edx
+ movdqa %xmm0,%xmm7
+ addl %esi,%ecx
+ andl %edi,%ebp
+ xorl %eax,%edi
+ psrld $30,%xmm0
+ addl %edx,%ecx
+ rorl $7,%edx
+ por %xmm6,%xmm4
+ xorl %eax,%ebp
+ movl %ecx,%esi
+ addl 12(%esp),%ebx
+ pslld $2,%xmm7
+ xorl %edi,%edx
+ roll $5,%ecx
+ pxor %xmm0,%xmm4
+ movdqa 96(%esp),%xmm0
+ addl %ebp,%ebx
+ andl %edx,%esi
+ pxor %xmm7,%xmm4
+ pshufd $238,%xmm1,%xmm5
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ movl %ebx,%ebp
+ punpcklqdq %xmm2,%xmm5
+ movdqa %xmm4,%xmm7
+ addl 16(%esp),%eax
+ xorl %edx,%ecx
+ paddd %xmm4,%xmm0
+ movdqa %xmm1,80(%esp)
+ roll $5,%ebx
+ addl %esi,%eax
+ psrldq $4,%xmm7
+ andl %ecx,%ebp
+ xorl %edx,%ecx
+ pxor %xmm1,%xmm5
+ addl %ebx,%eax
+ rorl $7,%ebx
+ pxor %xmm3,%xmm7
+ xorl %edx,%ebp
+ movl %eax,%esi
+ addl 20(%esp),%edi
+ pxor %xmm7,%xmm5
+ xorl %ecx,%ebx
+ roll $5,%eax
+ movdqa %xmm0,(%esp)
+ addl %ebp,%edi
+ andl %ebx,%esi
+ movdqa %xmm5,%xmm1
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ rorl $7,%eax
+ movdqa %xmm5,%xmm7
+ xorl %ecx,%esi
+ pslldq $12,%xmm1
+ paddd %xmm5,%xmm5
+ movl %edi,%ebp
+ addl 24(%esp),%edx
+ psrld $31,%xmm7
+ xorl %ebx,%eax
+ roll $5,%edi
+ movdqa %xmm1,%xmm0
+ addl %esi,%edx
+ andl %eax,%ebp
+ xorl %ebx,%eax
+ psrld $30,%xmm1
+ addl %edi,%edx
+ rorl $7,%edi
+ por %xmm7,%xmm5
+ xorl %ebx,%ebp
+ movl %edx,%esi
+ addl 28(%esp),%ecx
+ pslld $2,%xmm0
+ xorl %eax,%edi
+ roll $5,%edx
+ pxor %xmm1,%xmm5
+ movdqa 112(%esp),%xmm1
+ addl %ebp,%ecx
+ andl %edi,%esi
+ pxor %xmm0,%xmm5
+ pshufd $238,%xmm2,%xmm6
+ xorl %eax,%edi
+ addl %edx,%ecx
+ rorl $7,%edx
+ xorl %eax,%esi
+ movl %ecx,%ebp
+ punpcklqdq %xmm3,%xmm6
+ movdqa %xmm5,%xmm0
+ addl 32(%esp),%ebx
+ xorl %edi,%edx
+ paddd %xmm5,%xmm1
+ movdqa %xmm2,96(%esp)
+ roll $5,%ecx
+ addl %esi,%ebx
+ psrldq $4,%xmm0
+ andl %edx,%ebp
+ xorl %edi,%edx
+ pxor %xmm2,%xmm6
+ addl %ecx,%ebx
+ rorl $7,%ecx
+ pxor %xmm4,%xmm0
+ xorl %edi,%ebp
+ movl %ebx,%esi
+ addl 36(%esp),%eax
+ pxor %xmm0,%xmm6
+ xorl %edx,%ecx
+ roll $5,%ebx
+ movdqa %xmm1,16(%esp)
+ addl %ebp,%eax
+ andl %ecx,%esi
+ movdqa %xmm6,%xmm2
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ rorl $7,%ebx
+ movdqa %xmm6,%xmm0
+ xorl %edx,%esi
+ pslldq $12,%xmm2
+ paddd %xmm6,%xmm6
+ movl %eax,%ebp
+ addl 40(%esp),%edi
+ psrld $31,%xmm0
+ xorl %ecx,%ebx
+ roll $5,%eax
+ movdqa %xmm2,%xmm1
+ addl %esi,%edi
+ andl %ebx,%ebp
+ xorl %ecx,%ebx
+ psrld $30,%xmm2
+ addl %eax,%edi
+ rorl $7,%eax
+ por %xmm0,%xmm6
+ xorl %ecx,%ebp
+ movdqa 64(%esp),%xmm0
+ movl %edi,%esi
+ addl 44(%esp),%edx
+ pslld $2,%xmm1
+ xorl %ebx,%eax
+ roll $5,%edi
+ pxor %xmm2,%xmm6
+ movdqa 112(%esp),%xmm2
+ addl %ebp,%edx
+ andl %eax,%esi
+ pxor %xmm1,%xmm6
+ pshufd $238,%xmm3,%xmm7
+ xorl %ebx,%eax
+ addl %edi,%edx
+ rorl $7,%edi
+ xorl %ebx,%esi
+ movl %edx,%ebp
+ punpcklqdq %xmm4,%xmm7
+ movdqa %xmm6,%xmm1
+ addl 48(%esp),%ecx
+ xorl %eax,%edi
+ paddd %xmm6,%xmm2
+ movdqa %xmm3,64(%esp)
+ roll $5,%edx
+ addl %esi,%ecx
+ psrldq $4,%xmm1
+ andl %edi,%ebp
+ xorl %eax,%edi
+ pxor %xmm3,%xmm7
+ addl %edx,%ecx
+ rorl $7,%edx
+ pxor %xmm5,%xmm1
+ xorl %eax,%ebp
+ movl %ecx,%esi
+ addl 52(%esp),%ebx
+ pxor %xmm1,%xmm7
+ xorl %edi,%edx
+ roll $5,%ecx
+ movdqa %xmm2,32(%esp)
+ addl %ebp,%ebx
+ andl %edx,%esi
+ movdqa %xmm7,%xmm3
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ rorl $7,%ecx
+ movdqa %xmm7,%xmm1
+ xorl %edi,%esi
+ pslldq $12,%xmm3
+ paddd %xmm7,%xmm7
+ movl %ebx,%ebp
+ addl 56(%esp),%eax
+ psrld $31,%xmm1
+ xorl %edx,%ecx
+ roll $5,%ebx
+ movdqa %xmm3,%xmm2
+ addl %esi,%eax
+ andl %ecx,%ebp
+ xorl %edx,%ecx
+ psrld $30,%xmm3
+ addl %ebx,%eax
+ rorl $7,%ebx
+ por %xmm1,%xmm7
+ xorl %edx,%ebp
+ movdqa 80(%esp),%xmm1
+ movl %eax,%esi
+ addl 60(%esp),%edi
+ pslld $2,%xmm2
+ xorl %ecx,%ebx
+ roll $5,%eax
+ pxor %xmm3,%xmm7
+ movdqa 112(%esp),%xmm3
+ addl %ebp,%edi
+ andl %ebx,%esi
+ pxor %xmm2,%xmm7
+ pshufd $238,%xmm6,%xmm2
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ rorl $7,%eax
+ pxor %xmm4,%xmm0
+ punpcklqdq %xmm7,%xmm2
+ xorl %ecx,%esi
+ movl %edi,%ebp
+ addl (%esp),%edx
+ pxor %xmm1,%xmm0
+ movdqa %xmm4,80(%esp)
+ xorl %ebx,%eax
+ roll $5,%edi
+ movdqa %xmm3,%xmm4
+ addl %esi,%edx
+ paddd %xmm7,%xmm3
+ andl %eax,%ebp
+ pxor %xmm2,%xmm0
+ xorl %ebx,%eax
+ addl %edi,%edx
+ rorl $7,%edi
+ xorl %ebx,%ebp
+ movdqa %xmm0,%xmm2
+ movdqa %xmm3,48(%esp)
+ movl %edx,%esi
+ addl 4(%esp),%ecx
+ xorl %eax,%edi
+ roll $5,%edx
+ pslld $2,%xmm0
+ addl %ebp,%ecx
+ andl %edi,%esi
+ psrld $30,%xmm2
+ xorl %eax,%edi
+ addl %edx,%ecx
+ rorl $7,%edx
+ xorl %eax,%esi
+ movl %ecx,%ebp
+ addl 8(%esp),%ebx
+ xorl %edi,%edx
+ roll $5,%ecx
+ por %xmm2,%xmm0
+ addl %esi,%ebx
+ andl %edx,%ebp
+ movdqa 96(%esp),%xmm2
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 12(%esp),%eax
+ xorl %edi,%ebp
+ movl %ebx,%esi
+ pshufd $238,%xmm7,%xmm3
+ roll $5,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 16(%esp),%edi
+ pxor %xmm5,%xmm1
+ punpcklqdq %xmm0,%xmm3
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ roll $5,%eax
+ pxor %xmm2,%xmm1
+ movdqa %xmm5,96(%esp)
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ movdqa %xmm4,%xmm5
+ rorl $7,%ebx
+ paddd %xmm0,%xmm4
+ addl %eax,%edi
+ pxor %xmm3,%xmm1
+ addl 20(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ roll $5,%edi
+ movdqa %xmm1,%xmm3
+ movdqa %xmm4,(%esp)
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ rorl $7,%eax
+ addl %edi,%edx
+ pslld $2,%xmm1
+ addl 24(%esp),%ecx
+ xorl %eax,%esi
+ psrld $30,%xmm3
+ movl %edx,%ebp
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ rorl $7,%edi
+ addl %edx,%ecx
+ por %xmm3,%xmm1
+ addl 28(%esp),%ebx
+ xorl %edi,%ebp
+ movdqa 64(%esp),%xmm3
+ movl %ecx,%esi
+ roll $5,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ rorl $7,%edx
+ pshufd $238,%xmm0,%xmm4
+ addl %ecx,%ebx
+ addl 32(%esp),%eax
+ pxor %xmm6,%xmm2
+ punpcklqdq %xmm1,%xmm4
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ roll $5,%ebx
+ pxor %xmm3,%xmm2
+ movdqa %xmm6,64(%esp)
+ addl %esi,%eax
+ xorl %edx,%ebp
+ movdqa 128(%esp),%xmm6
+ rorl $7,%ecx
+ paddd %xmm1,%xmm5
+ addl %ebx,%eax
+ pxor %xmm4,%xmm2
+ addl 36(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ roll $5,%eax
+ movdqa %xmm2,%xmm4
+ movdqa %xmm5,16(%esp)
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%edi
+ pslld $2,%xmm2
+ addl 40(%esp),%edx
+ xorl %ebx,%esi
+ psrld $30,%xmm4
+ movl %edi,%ebp
+ roll $5,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ rorl $7,%eax
+ addl %edi,%edx
+ por %xmm4,%xmm2
+ addl 44(%esp),%ecx
+ xorl %eax,%ebp
+ movdqa 80(%esp),%xmm4
+ movl %edx,%esi
+ roll $5,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ rorl $7,%edi
+ pshufd $238,%xmm1,%xmm5
+ addl %edx,%ecx
+ addl 48(%esp),%ebx
+ pxor %xmm7,%xmm3
+ punpcklqdq %xmm2,%xmm5
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ roll $5,%ecx
+ pxor %xmm4,%xmm3
+ movdqa %xmm7,80(%esp)
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ movdqa %xmm6,%xmm7
+ rorl $7,%edx
+ paddd %xmm2,%xmm6
+ addl %ecx,%ebx
+ pxor %xmm5,%xmm3
+ addl 52(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ roll $5,%ebx
+ movdqa %xmm3,%xmm5
+ movdqa %xmm6,32(%esp)
+ addl %ebp,%eax
+ xorl %edx,%esi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ pslld $2,%xmm3
+ addl 56(%esp),%edi
+ xorl %ecx,%esi
+ psrld $30,%xmm5
+ movl %eax,%ebp
+ roll $5,%eax
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ rorl $7,%ebx
+ addl %eax,%edi
+ por %xmm5,%xmm3
+ addl 60(%esp),%edx
+ xorl %ebx,%ebp
+ movdqa 96(%esp),%xmm5
+ movl %edi,%esi
+ roll $5,%edi
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ rorl $7,%eax
+ pshufd $238,%xmm2,%xmm6
+ addl %edi,%edx
+ addl (%esp),%ecx
+ pxor %xmm0,%xmm4
+ punpcklqdq %xmm3,%xmm6
+ xorl %eax,%esi
+ movl %edx,%ebp
+ roll $5,%edx
+ pxor %xmm5,%xmm4
+ movdqa %xmm0,96(%esp)
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ movdqa %xmm7,%xmm0
+ rorl $7,%edi
+ paddd %xmm3,%xmm7
+ addl %edx,%ecx
+ pxor %xmm6,%xmm4
+ addl 4(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ roll $5,%ecx
+ movdqa %xmm4,%xmm6
+ movdqa %xmm7,48(%esp)
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ pslld $2,%xmm4
+ addl 8(%esp),%eax
+ xorl %edx,%esi
+ psrld $30,%xmm6
+ movl %ebx,%ebp
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ rorl $7,%ecx
+ addl %ebx,%eax
+ por %xmm6,%xmm4
+ addl 12(%esp),%edi
+ xorl %ecx,%ebp
+ movdqa 64(%esp),%xmm6
+ movl %eax,%esi
+ roll $5,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ pshufd $238,%xmm3,%xmm7
+ addl %eax,%edi
+ addl 16(%esp),%edx
+ pxor %xmm1,%xmm5
+ punpcklqdq %xmm4,%xmm7
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ roll $5,%edi
+ pxor %xmm6,%xmm5
+ movdqa %xmm1,64(%esp)
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ movdqa %xmm0,%xmm1
+ rorl $7,%eax
+ paddd %xmm4,%xmm0
+ addl %edi,%edx
+ pxor %xmm7,%xmm5
+ addl 20(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ roll $5,%edx
+ movdqa %xmm5,%xmm7
+ movdqa %xmm0,(%esp)
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ rorl $7,%edi
+ addl %edx,%ecx
+ pslld $2,%xmm5
+ addl 24(%esp),%ebx
+ xorl %edi,%esi
+ psrld $30,%xmm7
+ movl %ecx,%ebp
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ rorl $7,%edx
+ addl %ecx,%ebx
+ por %xmm7,%xmm5
+ addl 28(%esp),%eax
+ movdqa 80(%esp),%xmm7
+ rorl $7,%ecx
+ movl %ebx,%esi
+ xorl %edx,%ebp
+ roll $5,%ebx
+ pshufd $238,%xmm4,%xmm0
+ addl %ebp,%eax
+ xorl %ecx,%esi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 32(%esp),%edi
+ pxor %xmm2,%xmm6
+ punpcklqdq %xmm5,%xmm0
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ rorl $7,%ebx
+ pxor %xmm7,%xmm6
+ movdqa %xmm2,80(%esp)
+ movl %eax,%ebp
+ xorl %ecx,%esi
+ roll $5,%eax
+ movdqa %xmm1,%xmm2
+ addl %esi,%edi
+ paddd %xmm5,%xmm1
+ xorl %ebx,%ebp
+ pxor %xmm0,%xmm6
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ addl 36(%esp),%edx
+ andl %ebx,%ebp
+ movdqa %xmm6,%xmm0
+ movdqa %xmm1,16(%esp)
+ xorl %ecx,%ebx
+ rorl $7,%eax
+ movl %edi,%esi
+ xorl %ebx,%ebp
+ roll $5,%edi
+ pslld $2,%xmm6
+ addl %ebp,%edx
+ xorl %eax,%esi
+ psrld $30,%xmm0
+ xorl %ebx,%eax
+ addl %edi,%edx
+ addl 40(%esp),%ecx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ rorl $7,%edi
+ por %xmm0,%xmm6
+ movl %edx,%ebp
+ xorl %eax,%esi
+ movdqa 96(%esp),%xmm0
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %edi,%ebp
+ xorl %eax,%edi
+ addl %edx,%ecx
+ pshufd $238,%xmm5,%xmm1
+ addl 44(%esp),%ebx
+ andl %edi,%ebp
+ xorl %eax,%edi
+ rorl $7,%edx
+ movl %ecx,%esi
+ xorl %edi,%ebp
+ roll $5,%ecx
+ addl %ebp,%ebx
+ xorl %edx,%esi
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 48(%esp),%eax
+ pxor %xmm3,%xmm7
+ punpcklqdq %xmm6,%xmm1
+ andl %edx,%esi
+ xorl %edi,%edx
+ rorl $7,%ecx
+ pxor %xmm0,%xmm7
+ movdqa %xmm3,96(%esp)
+ movl %ebx,%ebp
+ xorl %edx,%esi
+ roll $5,%ebx
+ movdqa 144(%esp),%xmm3
+ addl %esi,%eax
+ paddd %xmm6,%xmm2
+ xorl %ecx,%ebp
+ pxor %xmm1,%xmm7
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 52(%esp),%edi
+ andl %ecx,%ebp
+ movdqa %xmm7,%xmm1
+ movdqa %xmm2,32(%esp)
+ xorl %edx,%ecx
+ rorl $7,%ebx
+ movl %eax,%esi
+ xorl %ecx,%ebp
+ roll $5,%eax
+ pslld $2,%xmm7
+ addl %ebp,%edi
+ xorl %ebx,%esi
+ psrld $30,%xmm1
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ addl 56(%esp),%edx
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ rorl $7,%eax
+ por %xmm1,%xmm7
+ movl %edi,%ebp
+ xorl %ebx,%esi
+ movdqa 64(%esp),%xmm1
+ roll $5,%edi
+ addl %esi,%edx
+ xorl %eax,%ebp
+ xorl %ebx,%eax
+ addl %edi,%edx
+ pshufd $238,%xmm6,%xmm2
+ addl 60(%esp),%ecx
+ andl %eax,%ebp
+ xorl %ebx,%eax
+ rorl $7,%edi
+ movl %edx,%esi
+ xorl %eax,%ebp
+ roll $5,%edx
+ addl %ebp,%ecx
+ xorl %edi,%esi
+ xorl %eax,%edi
+ addl %edx,%ecx
+ addl (%esp),%ebx
+ pxor %xmm4,%xmm0
+ punpcklqdq %xmm7,%xmm2
+ andl %edi,%esi
+ xorl %eax,%edi
+ rorl $7,%edx
+ pxor %xmm1,%xmm0
+ movdqa %xmm4,64(%esp)
+ movl %ecx,%ebp
+ xorl %edi,%esi
+ roll $5,%ecx
+ movdqa %xmm3,%xmm4
+ addl %esi,%ebx
+ paddd %xmm7,%xmm3
+ xorl %edx,%ebp
+ pxor %xmm2,%xmm0
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 4(%esp),%eax
+ andl %edx,%ebp
+ movdqa %xmm0,%xmm2
+ movdqa %xmm3,48(%esp)
+ xorl %edi,%edx
+ rorl $7,%ecx
+ movl %ebx,%esi
+ xorl %edx,%ebp
+ roll $5,%ebx
+ pslld $2,%xmm0
+ addl %ebp,%eax
+ xorl %ecx,%esi
+ psrld $30,%xmm2
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 8(%esp),%edi
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ rorl $7,%ebx
+ por %xmm2,%xmm0
+ movl %eax,%ebp
+ xorl %ecx,%esi
+ movdqa 80(%esp),%xmm2
+ roll $5,%eax
+ addl %esi,%edi
+ xorl %ebx,%ebp
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ pshufd $238,%xmm7,%xmm3
+ addl 12(%esp),%edx
+ andl %ebx,%ebp
+ xorl %ecx,%ebx
+ rorl $7,%eax
+ movl %edi,%esi
+ xorl %ebx,%ebp
+ roll $5,%edi
+ addl %ebp,%edx
+ xorl %eax,%esi
+ xorl %ebx,%eax
+ addl %edi,%edx
+ addl 16(%esp),%ecx
+ pxor %xmm5,%xmm1
+ punpcklqdq %xmm0,%xmm3
+ andl %eax,%esi
+ xorl %ebx,%eax
+ rorl $7,%edi
+ pxor %xmm2,%xmm1
+ movdqa %xmm5,80(%esp)
+ movl %edx,%ebp
+ xorl %eax,%esi
+ roll $5,%edx
+ movdqa %xmm4,%xmm5
+ addl %esi,%ecx
+ paddd %xmm0,%xmm4
+ xorl %edi,%ebp
+ pxor %xmm3,%xmm1
+ xorl %eax,%edi
+ addl %edx,%ecx
+ addl 20(%esp),%ebx
+ andl %edi,%ebp
+ movdqa %xmm1,%xmm3
+ movdqa %xmm4,(%esp)
+ xorl %eax,%edi
+ rorl $7,%edx
+ movl %ecx,%esi
+ xorl %edi,%ebp
+ roll $5,%ecx
+ pslld $2,%xmm1
+ addl %ebp,%ebx
+ xorl %edx,%esi
+ psrld $30,%xmm3
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 24(%esp),%eax
+ andl %edx,%esi
+ xorl %edi,%edx
+ rorl $7,%ecx
+ por %xmm3,%xmm1
+ movl %ebx,%ebp
+ xorl %edx,%esi
+ movdqa 96(%esp),%xmm3
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %ecx,%ebp
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ pshufd $238,%xmm0,%xmm4
+ addl 28(%esp),%edi
+ andl %ecx,%ebp
+ xorl %edx,%ecx
+ rorl $7,%ebx
+ movl %eax,%esi
+ xorl %ecx,%ebp
+ roll $5,%eax
+ addl %ebp,%edi
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ addl 32(%esp),%edx
+ pxor %xmm6,%xmm2
+ punpcklqdq %xmm1,%xmm4
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ rorl $7,%eax
+ pxor %xmm3,%xmm2
+ movdqa %xmm6,96(%esp)
+ movl %edi,%ebp
+ xorl %ebx,%esi
+ roll $5,%edi
+ movdqa %xmm5,%xmm6
+ addl %esi,%edx
+ paddd %xmm1,%xmm5
+ xorl %eax,%ebp
+ pxor %xmm4,%xmm2
+ xorl %ebx,%eax
+ addl %edi,%edx
+ addl 36(%esp),%ecx
+ andl %eax,%ebp
+ movdqa %xmm2,%xmm4
+ movdqa %xmm5,16(%esp)
+ xorl %ebx,%eax
+ rorl $7,%edi
+ movl %edx,%esi
+ xorl %eax,%ebp
+ roll $5,%edx
+ pslld $2,%xmm2
+ addl %ebp,%ecx
+ xorl %edi,%esi
+ psrld $30,%xmm4
+ xorl %eax,%edi
+ addl %edx,%ecx
+ addl 40(%esp),%ebx
+ andl %edi,%esi
+ xorl %eax,%edi
+ rorl $7,%edx
+ por %xmm4,%xmm2
+ movl %ecx,%ebp
+ xorl %edi,%esi
+ movdqa 64(%esp),%xmm4
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %edx,%ebp
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ pshufd $238,%xmm1,%xmm5
+ addl 44(%esp),%eax
+ andl %edx,%ebp
+ xorl %edi,%edx
+ rorl $7,%ecx
+ movl %ebx,%esi
+ xorl %edx,%ebp
+ roll $5,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ addl %ebx,%eax
+ addl 48(%esp),%edi
+ pxor %xmm7,%xmm3
+ punpcklqdq %xmm2,%xmm5
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ roll $5,%eax
+ pxor %xmm4,%xmm3
+ movdqa %xmm7,64(%esp)
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ movdqa %xmm6,%xmm7
+ rorl $7,%ebx
+ paddd %xmm2,%xmm6
+ addl %eax,%edi
+ pxor %xmm5,%xmm3
+ addl 52(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ roll $5,%edi
+ movdqa %xmm3,%xmm5
+ movdqa %xmm6,32(%esp)
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ rorl $7,%eax
+ addl %edi,%edx
+ pslld $2,%xmm3
+ addl 56(%esp),%ecx
+ xorl %eax,%esi
+ psrld $30,%xmm5
+ movl %edx,%ebp
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ rorl $7,%edi
+ addl %edx,%ecx
+ por %xmm5,%xmm3
+ addl 60(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ roll $5,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl (%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ rorl $7,%ecx
+ paddd %xmm3,%xmm7
+ addl %ebx,%eax
+ addl 4(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ movdqa %xmm7,48(%esp)
+ roll $5,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%edi
+ addl 8(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ roll $5,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 12(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ roll $5,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ rorl $7,%edi
+ addl %edx,%ecx
+ movl 196(%esp),%ebp
+ cmpl 200(%esp),%ebp
+ je L007done
+ movdqa 160(%esp),%xmm7
+ movdqa 176(%esp),%xmm6
+ movdqu (%ebp),%xmm0
+ movdqu 16(%ebp),%xmm1
+ movdqu 32(%ebp),%xmm2
+ movdqu 48(%ebp),%xmm3
+ addl $64,%ebp
+.byte 102,15,56,0,198
+ movl %ebp,196(%esp)
+ movdqa %xmm7,96(%esp)
+ addl 16(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ rorl $7,%edx
+.byte 102,15,56,0,206
+ addl %ecx,%ebx
+ addl 20(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ paddd %xmm7,%xmm0
+ roll $5,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ rorl $7,%ecx
+ movdqa %xmm0,(%esp)
+ addl %ebx,%eax
+ addl 24(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ psubd %xmm7,%xmm0
+ roll $5,%eax
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ rorl $7,%ebx
+ addl %eax,%edi
+ addl 28(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ roll $5,%edi
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 32(%esp),%ecx
+ xorl %eax,%esi
+ movl %edx,%ebp
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ rorl $7,%edi
+.byte 102,15,56,0,214
+ addl %edx,%ecx
+ addl 36(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ paddd %xmm7,%xmm1
+ roll $5,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ rorl $7,%edx
+ movdqa %xmm1,16(%esp)
+ addl %ecx,%ebx
+ addl 40(%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ psubd %xmm7,%xmm1
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 44(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ roll $5,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%edi
+ addl 48(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ roll $5,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ rorl $7,%eax
+.byte 102,15,56,0,222
+ addl %edi,%edx
+ addl 52(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ paddd %xmm7,%xmm2
+ roll $5,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ rorl $7,%edi
+ movdqa %xmm2,32(%esp)
+ addl %edx,%ecx
+ addl 56(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ psubd %xmm7,%xmm2
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 60(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %ebp,%eax
+ rorl $7,%ecx
+ addl %ebx,%eax
+ movl 192(%esp),%ebp
+ addl (%ebp),%eax
+ addl 4(%ebp),%esi
+ addl 8(%ebp),%ecx
+ movl %eax,(%ebp)
+ addl 12(%ebp),%edx
+ movl %esi,4(%ebp)
+ addl 16(%ebp),%edi
+ movl %ecx,8(%ebp)
+ movl %ecx,%ebx
+ movl %edx,12(%ebp)
+ xorl %edx,%ebx
+ movl %edi,16(%ebp)
+ movl %esi,%ebp
+ pshufd $238,%xmm0,%xmm4
+ andl %ebx,%esi
+ movl %ebp,%ebx
+ jmp L006loop
+.align 4,0x90
+L007done:
+ addl 16(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 20(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 24(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ roll $5,%eax
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ rorl $7,%ebx
+ addl %eax,%edi
+ addl 28(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ roll $5,%edi
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 32(%esp),%ecx
+ xorl %eax,%esi
+ movl %edx,%ebp
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ rorl $7,%edi
+ addl %edx,%ecx
+ addl 36(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ roll $5,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 40(%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 44(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ roll $5,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%edi
+ addl 48(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ roll $5,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 52(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ roll $5,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ rorl $7,%edi
+ addl %edx,%ecx
+ addl 56(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 60(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %ebp,%eax
+ rorl $7,%ecx
+ addl %ebx,%eax
+ movl 192(%esp),%ebp
+ addl (%ebp),%eax
+ movl 204(%esp),%esp
+ addl 4(%ebp),%esi
+ addl 8(%ebp),%ecx
+ movl %eax,(%ebp)
+ addl 12(%ebp),%edx
+ movl %esi,4(%ebp)
+ addl 16(%ebp),%edi
+ movl %ecx,8(%ebp)
+ movl %edx,12(%ebp)
+ movl %edi,16(%ebp)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 6,0x90
+LK_XX_XX:
+.long 1518500249,1518500249,1518500249,1518500249
+.long 1859775393,1859775393,1859775393,1859775393
+.long 2400959708,2400959708,2400959708,2400959708
+.long 3395469782,3395469782,3395469782,3395469782
+.long 66051,67438087,134810123,202182159
+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115
.byte 102,111,114,109,32,102,111,114,32,120,56,54,44,32,67,82
.byte 89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112
.byte 114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.section __IMPORT,__pointers,non_lazy_symbol_pointers
+L_OPENSSL_ia32cap_P$non_lazy_ptr:
+.indirect_symbol _OPENSSL_ia32cap_P
+.long 0
#endif
diff --git a/mac-x86/crypto/sha/sha256-586.S b/mac-x86/crypto/sha/sha256-586.S
index d40db0d..f0ba612 100644
--- a/mac-x86/crypto/sha/sha256-586.S
+++ b/mac-x86/crypto/sha/sha256-586.S
@@ -26,6 +26,27 @@ L000pic_point:
movl %edi,4(%esp)
movl %eax,8(%esp)
movl %ebx,12(%esp)
+ movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L001K256(%ebp),%edx
+ movl (%edx),%ecx
+ movl 4(%edx),%ebx
+ testl $1048576,%ecx
+ jnz L002loop
+ movl 8(%edx),%edx
+ testl $16777216,%ecx
+ jz L003no_xmm
+ andl $1073741824,%ecx
+ andl $268435968,%ebx
+ testl $536870912,%edx
+ jnz L004shaext
+ orl %ebx,%ecx
+ andl $1342177280,%ecx
+ cmpl $1342177280,%ecx
+ testl $512,%ebx
+ jnz L005SSSE3
+L003no_xmm:
+ subl %edi,%eax
+ cmpl $256,%eax
+ jae L006unrolled
jmp L002loop
.align 4,0x90
L002loop:
@@ -97,7 +118,7 @@ L002loop:
movl %ecx,28(%esp)
movl %edi,32(%esp)
.align 4,0x90
-L00300_15:
+L00700_15:
movl %edx,%ecx
movl 24(%esp),%esi
rorl $14,%ecx
@@ -135,11 +156,11 @@ L00300_15:
addl $4,%ebp
addl %ebx,%eax
cmpl $3248222580,%esi
- jne L00300_15
+ jne L00700_15
movl 156(%esp),%ecx
- jmp L00416_63
+ jmp L00816_63
.align 4,0x90
-L00416_63:
+L00816_63:
movl %ecx,%ebx
movl 104(%esp),%esi
rorl $11,%ecx
@@ -194,7 +215,7 @@ L00416_63:
addl $4,%ebp
addl %ebx,%eax
cmpl $3329325298,%esi
- jne L00416_63
+ jne L00816_63
movl 356(%esp),%esi
movl 8(%esp),%ebx
movl 16(%esp),%ecx
@@ -228,207 +249,6 @@ L00416_63:
popl %ebx
popl %ebp
ret
-.align 5,0x90
-L005loop_shrd:
- movl (%edi),%eax
- movl 4(%edi),%ebx
- movl 8(%edi),%ecx
- bswap %eax
- movl 12(%edi),%edx
- bswap %ebx
- pushl %eax
- bswap %ecx
- pushl %ebx
- bswap %edx
- pushl %ecx
- pushl %edx
- movl 16(%edi),%eax
- movl 20(%edi),%ebx
- movl 24(%edi),%ecx
- bswap %eax
- movl 28(%edi),%edx
- bswap %ebx
- pushl %eax
- bswap %ecx
- pushl %ebx
- bswap %edx
- pushl %ecx
- pushl %edx
- movl 32(%edi),%eax
- movl 36(%edi),%ebx
- movl 40(%edi),%ecx
- bswap %eax
- movl 44(%edi),%edx
- bswap %ebx
- pushl %eax
- bswap %ecx
- pushl %ebx
- bswap %edx
- pushl %ecx
- pushl %edx
- movl 48(%edi),%eax
- movl 52(%edi),%ebx
- movl 56(%edi),%ecx
- bswap %eax
- movl 60(%edi),%edx
- bswap %ebx
- pushl %eax
- bswap %ecx
- pushl %ebx
- bswap %edx
- pushl %ecx
- pushl %edx
- addl $64,%edi
- leal -36(%esp),%esp
- movl %edi,104(%esp)
- movl (%esi),%eax
- movl 4(%esi),%ebx
- movl 8(%esi),%ecx
- movl 12(%esi),%edi
- movl %ebx,8(%esp)
- xorl %ecx,%ebx
- movl %ecx,12(%esp)
- movl %edi,16(%esp)
- movl %ebx,(%esp)
- movl 16(%esi),%edx
- movl 20(%esi),%ebx
- movl 24(%esi),%ecx
- movl 28(%esi),%edi
- movl %ebx,24(%esp)
- movl %ecx,28(%esp)
- movl %edi,32(%esp)
-.align 4,0x90
-L00600_15_shrd:
- movl %edx,%ecx
- movl 24(%esp),%esi
- shrdl $14,%ecx,%ecx
- movl 28(%esp),%edi
- xorl %edx,%ecx
- xorl %edi,%esi
- movl 96(%esp),%ebx
- shrdl $5,%ecx,%ecx
- andl %edx,%esi
- movl %edx,20(%esp)
- xorl %ecx,%edx
- addl 32(%esp),%ebx
- xorl %edi,%esi
- shrdl $6,%edx,%edx
- movl %eax,%ecx
- addl %esi,%ebx
- shrdl $9,%ecx,%ecx
- addl %edx,%ebx
- movl 8(%esp),%edi
- xorl %eax,%ecx
- movl %eax,4(%esp)
- leal -4(%esp),%esp
- shrdl $11,%ecx,%ecx
- movl (%ebp),%esi
- xorl %eax,%ecx
- movl 20(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %esi,%ebx
- movl %eax,(%esp)
- addl %ebx,%edx
- andl 4(%esp),%eax
- addl %ecx,%ebx
- xorl %edi,%eax
- addl $4,%ebp
- addl %ebx,%eax
- cmpl $3248222580,%esi
- jne L00600_15_shrd
- movl 156(%esp),%ecx
- jmp L00716_63_shrd
-.align 4,0x90
-L00716_63_shrd:
- movl %ecx,%ebx
- movl 104(%esp),%esi
- shrdl $11,%ecx,%ecx
- movl %esi,%edi
- shrdl $2,%esi,%esi
- xorl %ebx,%ecx
- shrl $3,%ebx
- shrdl $7,%ecx,%ecx
- xorl %edi,%esi
- xorl %ecx,%ebx
- shrdl $17,%esi,%esi
- addl 160(%esp),%ebx
- shrl $10,%edi
- addl 124(%esp),%ebx
- movl %edx,%ecx
- xorl %esi,%edi
- movl 24(%esp),%esi
- shrdl $14,%ecx,%ecx
- addl %edi,%ebx
- movl 28(%esp),%edi
- xorl %edx,%ecx
- xorl %edi,%esi
- movl %ebx,96(%esp)
- shrdl $5,%ecx,%ecx
- andl %edx,%esi
- movl %edx,20(%esp)
- xorl %ecx,%edx
- addl 32(%esp),%ebx
- xorl %edi,%esi
- shrdl $6,%edx,%edx
- movl %eax,%ecx
- addl %esi,%ebx
- shrdl $9,%ecx,%ecx
- addl %edx,%ebx
- movl 8(%esp),%edi
- xorl %eax,%ecx
- movl %eax,4(%esp)
- leal -4(%esp),%esp
- shrdl $11,%ecx,%ecx
- movl (%ebp),%esi
- xorl %eax,%ecx
- movl 20(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %esi,%ebx
- movl %eax,(%esp)
- addl %ebx,%edx
- andl 4(%esp),%eax
- addl %ecx,%ebx
- xorl %edi,%eax
- movl 156(%esp),%ecx
- addl $4,%ebp
- addl %ebx,%eax
- cmpl $3329325298,%esi
- jne L00716_63_shrd
- movl 356(%esp),%esi
- movl 8(%esp),%ebx
- movl 16(%esp),%ecx
- addl (%esi),%eax
- addl 4(%esi),%ebx
- addl 8(%esi),%edi
- addl 12(%esi),%ecx
- movl %eax,(%esi)
- movl %ebx,4(%esi)
- movl %edi,8(%esi)
- movl %ecx,12(%esi)
- movl 24(%esp),%eax
- movl 28(%esp),%ebx
- movl 32(%esp),%ecx
- movl 360(%esp),%edi
- addl 16(%esi),%edx
- addl 20(%esi),%eax
- addl 24(%esi),%ebx
- addl 28(%esi),%ecx
- movl %edx,16(%esi)
- movl %eax,20(%esi)
- movl %ebx,24(%esi)
- movl %ecx,28(%esi)
- leal 356(%esp),%esp
- subl $256,%ebp
- cmpl 8(%esp),%edi
- jb L005loop_shrd
- movl 12(%esp),%esp
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
.align 6,0x90
L001K256:
.long 1116352408,1899447441,3049323471,3921009573,961987163,1508970993,2453635748,2870763221,3624381080,310598401,607225278,1426881987,1925078388,2162078206,2614888103,3248222580,3835390401,4022224774,264347078,604807628,770255983,1249150122,1555081692,1996064986,2554220882,2821834349,2952996808,3210313671,3336571891,3584528711,113926993,338241895,666307205,773529912,1294757372,1396182291,1695183700,1986661051,2177026350,2456956037,2730485921,2820302411,3259730800,3345764771,3516065817,3600352804,4094571909,275423344,430227734,506948616,659060556,883997877,958139571,1322822218,1537002063,1747873779,1955562222,2024104815,2227730452,2361852424,2428436474,2756734187,3204031479,3329325298
@@ -439,7 +259,7 @@ L001K256:
.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
.byte 62,0
.align 4,0x90
-L008unrolled:
+L006unrolled:
leal -96(%esp),%esp
movl (%esi),%eax
movl 4(%esi),%ebp
@@ -3345,4 +3165,1417 @@ L009grand_loop:
popl %ebx
popl %ebp
ret
+.align 5,0x90
+L004shaext:
+ subl $32,%esp
+ movdqu (%esi),%xmm1
+ leal 128(%ebp),%ebp
+ movdqu 16(%esi),%xmm2
+ movdqa 128(%ebp),%xmm7
+ pshufd $27,%xmm1,%xmm0
+ pshufd $177,%xmm1,%xmm1
+ pshufd $27,%xmm2,%xmm2
+.byte 102,15,58,15,202,8
+ punpcklqdq %xmm0,%xmm2
+ jmp L010loop_shaext
+.align 4,0x90
+L010loop_shaext:
+ movdqu (%edi),%xmm3
+ movdqu 16(%edi),%xmm4
+ movdqu 32(%edi),%xmm5
+.byte 102,15,56,0,223
+ movdqu 48(%edi),%xmm6
+ movdqa %xmm2,16(%esp)
+ movdqa -128(%ebp),%xmm0
+ paddd %xmm3,%xmm0
+.byte 102,15,56,0,231
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ nop
+ movdqa %xmm1,(%esp)
+.byte 15,56,203,202
+ movdqa -112(%ebp),%xmm0
+ paddd %xmm4,%xmm0
+.byte 102,15,56,0,239
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ leal 64(%edi),%edi
+.byte 15,56,204,220
+.byte 15,56,203,202
+ movdqa -96(%ebp),%xmm0
+ paddd %xmm5,%xmm0
+.byte 102,15,56,0,247
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm6,%xmm7
+.byte 102,15,58,15,253,4
+ nop
+ paddd %xmm7,%xmm3
+.byte 15,56,204,229
+.byte 15,56,203,202
+ movdqa -80(%ebp),%xmm0
+ paddd %xmm6,%xmm0
+.byte 15,56,205,222
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm3,%xmm7
+.byte 102,15,58,15,254,4
+ nop
+ paddd %xmm7,%xmm4
+.byte 15,56,204,238
+.byte 15,56,203,202
+ movdqa -64(%ebp),%xmm0
+ paddd %xmm3,%xmm0
+.byte 15,56,205,227
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm4,%xmm7
+.byte 102,15,58,15,251,4
+ nop
+ paddd %xmm7,%xmm5
+.byte 15,56,204,243
+.byte 15,56,203,202
+ movdqa -48(%ebp),%xmm0
+ paddd %xmm4,%xmm0
+.byte 15,56,205,236
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm5,%xmm7
+.byte 102,15,58,15,252,4
+ nop
+ paddd %xmm7,%xmm6
+.byte 15,56,204,220
+.byte 15,56,203,202
+ movdqa -32(%ebp),%xmm0
+ paddd %xmm5,%xmm0
+.byte 15,56,205,245
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm6,%xmm7
+.byte 102,15,58,15,253,4
+ nop
+ paddd %xmm7,%xmm3
+.byte 15,56,204,229
+.byte 15,56,203,202
+ movdqa -16(%ebp),%xmm0
+ paddd %xmm6,%xmm0
+.byte 15,56,205,222
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm3,%xmm7
+.byte 102,15,58,15,254,4
+ nop
+ paddd %xmm7,%xmm4
+.byte 15,56,204,238
+.byte 15,56,203,202
+ movdqa (%ebp),%xmm0
+ paddd %xmm3,%xmm0
+.byte 15,56,205,227
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm4,%xmm7
+.byte 102,15,58,15,251,4
+ nop
+ paddd %xmm7,%xmm5
+.byte 15,56,204,243
+.byte 15,56,203,202
+ movdqa 16(%ebp),%xmm0
+ paddd %xmm4,%xmm0
+.byte 15,56,205,236
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm5,%xmm7
+.byte 102,15,58,15,252,4
+ nop
+ paddd %xmm7,%xmm6
+.byte 15,56,204,220
+.byte 15,56,203,202
+ movdqa 32(%ebp),%xmm0
+ paddd %xmm5,%xmm0
+.byte 15,56,205,245
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm6,%xmm7
+.byte 102,15,58,15,253,4
+ nop
+ paddd %xmm7,%xmm3
+.byte 15,56,204,229
+.byte 15,56,203,202
+ movdqa 48(%ebp),%xmm0
+ paddd %xmm6,%xmm0
+.byte 15,56,205,222
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm3,%xmm7
+.byte 102,15,58,15,254,4
+ nop
+ paddd %xmm7,%xmm4
+.byte 15,56,204,238
+.byte 15,56,203,202
+ movdqa 64(%ebp),%xmm0
+ paddd %xmm3,%xmm0
+.byte 15,56,205,227
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm4,%xmm7
+.byte 102,15,58,15,251,4
+ nop
+ paddd %xmm7,%xmm5
+.byte 15,56,204,243
+.byte 15,56,203,202
+ movdqa 80(%ebp),%xmm0
+ paddd %xmm4,%xmm0
+.byte 15,56,205,236
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm5,%xmm7
+.byte 102,15,58,15,252,4
+.byte 15,56,203,202
+ paddd %xmm7,%xmm6
+ movdqa 96(%ebp),%xmm0
+ paddd %xmm5,%xmm0
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+.byte 15,56,205,245
+ movdqa 128(%ebp),%xmm7
+.byte 15,56,203,202
+ movdqa 112(%ebp),%xmm0
+ paddd %xmm6,%xmm0
+ nop
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ cmpl %edi,%eax
+ nop
+.byte 15,56,203,202
+ paddd 16(%esp),%xmm2
+ paddd (%esp),%xmm1
+ jnz L010loop_shaext
+ pshufd $177,%xmm2,%xmm2
+ pshufd $27,%xmm1,%xmm7
+ pshufd $177,%xmm1,%xmm1
+ punpckhqdq %xmm2,%xmm1
+.byte 102,15,58,15,215,8
+ movl 44(%esp),%esp
+ movdqu %xmm1,(%esi)
+ movdqu %xmm2,16(%esi)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 5,0x90
+L005SSSE3:
+ leal -96(%esp),%esp
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edi
+ movl %ebx,4(%esp)
+ xorl %ecx,%ebx
+ movl %ecx,8(%esp)
+ movl %edi,12(%esp)
+ movl 16(%esi),%edx
+ movl 20(%esi),%edi
+ movl 24(%esi),%ecx
+ movl 28(%esi),%esi
+ movl %edi,20(%esp)
+ movl 100(%esp),%edi
+ movl %ecx,24(%esp)
+ movl %esi,28(%esp)
+ movdqa 256(%ebp),%xmm7
+ jmp L011grand_ssse3
+.align 4,0x90
+L011grand_ssse3:
+ movdqu (%edi),%xmm0
+ movdqu 16(%edi),%xmm1
+ movdqu 32(%edi),%xmm2
+ movdqu 48(%edi),%xmm3
+ addl $64,%edi
+.byte 102,15,56,0,199
+ movl %edi,100(%esp)
+.byte 102,15,56,0,207
+ movdqa (%ebp),%xmm4
+.byte 102,15,56,0,215
+ movdqa 16(%ebp),%xmm5
+ paddd %xmm0,%xmm4
+.byte 102,15,56,0,223
+ movdqa 32(%ebp),%xmm6
+ paddd %xmm1,%xmm5
+ movdqa 48(%ebp),%xmm7
+ movdqa %xmm4,32(%esp)
+ paddd %xmm2,%xmm6
+ movdqa %xmm5,48(%esp)
+ paddd %xmm3,%xmm7
+ movdqa %xmm6,64(%esp)
+ movdqa %xmm7,80(%esp)
+ jmp L012ssse3_00_47
+.align 4,0x90
+L012ssse3_00_47:
+ addl $64,%ebp
+ movl %edx,%ecx
+ movdqa %xmm1,%xmm4
+ rorl $14,%edx
+ movl 20(%esp),%esi
+ movdqa %xmm3,%xmm7
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+.byte 102,15,58,15,224,4
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+.byte 102,15,58,15,250,4
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ movdqa %xmm4,%xmm5
+ rorl $6,%edx
+ movl %eax,%ecx
+ movdqa %xmm4,%xmm6
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ psrld $3,%xmm4
+ movl %eax,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm0
+ movl %eax,(%esp)
+ xorl %eax,%ecx
+ psrld $7,%xmm6
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ pshufd $250,%xmm3,%xmm7
+ xorl %esi,%ecx
+ addl 32(%esp),%edx
+ pslld $14,%xmm5
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm4
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ psrld $11,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm5,%xmm4
+ movl 16(%esp),%esi
+ xorl %ecx,%edx
+ pslld $11,%xmm5
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ pxor %xmm6,%xmm4
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ movdqa %xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ pxor %xmm5,%xmm4
+ movl %ebx,%ecx
+ addl %edi,%edx
+ psrld $10,%xmm7
+ movl (%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm4,%xmm0
+ movl %ebx,28(%esp)
+ xorl %ebx,%ecx
+ psrlq $17,%xmm6
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ rorl $11,%ecx
+ pxor %xmm6,%xmm7
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ psrlq $2,%xmm6
+ addl 36(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ pshufd $128,%xmm7,%xmm7
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 12(%esp),%esi
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ psrldq $8,%xmm7
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ paddd %xmm7,%xmm0
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,24(%esp)
+ pshufd $80,%xmm0,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ movdqa %xmm7,%xmm6
+ rorl $11,%ecx
+ psrld $10,%xmm7
+ andl %eax,%ebx
+ psrlq $17,%xmm6
+ xorl %esi,%ecx
+ addl 40(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ psrlq $2,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm6,%xmm7
+ movl 8(%esp),%esi
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ pshufd $8,%xmm7,%xmm7
+ xorl %edi,%esi
+ rorl $5,%edx
+ movdqa (%ebp),%xmm6
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ pslldq $8,%xmm7
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm0
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ paddd %xmm0,%xmm6
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 44(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ movdqa %xmm6,32(%esp)
+ movl %edx,%ecx
+ movdqa %xmm2,%xmm4
+ rorl $14,%edx
+ movl 4(%esp),%esi
+ movdqa %xmm0,%xmm7
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+.byte 102,15,58,15,225,4
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+.byte 102,15,58,15,251,4
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ movdqa %xmm4,%xmm5
+ rorl $6,%edx
+ movl %eax,%ecx
+ movdqa %xmm4,%xmm6
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ psrld $3,%xmm4
+ movl %eax,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm1
+ movl %eax,16(%esp)
+ xorl %eax,%ecx
+ psrld $7,%xmm6
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ pshufd $250,%xmm0,%xmm7
+ xorl %esi,%ecx
+ addl 48(%esp),%edx
+ pslld $14,%xmm5
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm4
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ psrld $11,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm5,%xmm4
+ movl (%esp),%esi
+ xorl %ecx,%edx
+ pslld $11,%xmm5
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ pxor %xmm6,%xmm4
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ movdqa %xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ pxor %xmm5,%xmm4
+ movl %ebx,%ecx
+ addl %edi,%edx
+ psrld $10,%xmm7
+ movl 16(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm4,%xmm1
+ movl %ebx,12(%esp)
+ xorl %ebx,%ecx
+ psrlq $17,%xmm6
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ rorl $11,%ecx
+ pxor %xmm6,%xmm7
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ psrlq $2,%xmm6
+ addl 52(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ pshufd $128,%xmm7,%xmm7
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 28(%esp),%esi
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ psrldq $8,%xmm7
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ paddd %xmm7,%xmm1
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,8(%esp)
+ pshufd $80,%xmm1,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ movdqa %xmm7,%xmm6
+ rorl $11,%ecx
+ psrld $10,%xmm7
+ andl %eax,%ebx
+ psrlq $17,%xmm6
+ xorl %esi,%ecx
+ addl 56(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ psrlq $2,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm6,%xmm7
+ movl 24(%esp),%esi
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ pshufd $8,%xmm7,%xmm7
+ xorl %edi,%esi
+ rorl $5,%edx
+ movdqa 16(%ebp),%xmm6
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ pslldq $8,%xmm7
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm1
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ paddd %xmm1,%xmm6
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 60(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ movdqa %xmm6,48(%esp)
+ movl %edx,%ecx
+ movdqa %xmm3,%xmm4
+ rorl $14,%edx
+ movl 20(%esp),%esi
+ movdqa %xmm1,%xmm7
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+.byte 102,15,58,15,226,4
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+.byte 102,15,58,15,248,4
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ movdqa %xmm4,%xmm5
+ rorl $6,%edx
+ movl %eax,%ecx
+ movdqa %xmm4,%xmm6
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ psrld $3,%xmm4
+ movl %eax,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm2
+ movl %eax,(%esp)
+ xorl %eax,%ecx
+ psrld $7,%xmm6
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ pshufd $250,%xmm1,%xmm7
+ xorl %esi,%ecx
+ addl 64(%esp),%edx
+ pslld $14,%xmm5
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm4
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ psrld $11,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm5,%xmm4
+ movl 16(%esp),%esi
+ xorl %ecx,%edx
+ pslld $11,%xmm5
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ pxor %xmm6,%xmm4
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ movdqa %xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ pxor %xmm5,%xmm4
+ movl %ebx,%ecx
+ addl %edi,%edx
+ psrld $10,%xmm7
+ movl (%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm4,%xmm2
+ movl %ebx,28(%esp)
+ xorl %ebx,%ecx
+ psrlq $17,%xmm6
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ rorl $11,%ecx
+ pxor %xmm6,%xmm7
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ psrlq $2,%xmm6
+ addl 68(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ pshufd $128,%xmm7,%xmm7
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 12(%esp),%esi
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ psrldq $8,%xmm7
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ paddd %xmm7,%xmm2
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,24(%esp)
+ pshufd $80,%xmm2,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ movdqa %xmm7,%xmm6
+ rorl $11,%ecx
+ psrld $10,%xmm7
+ andl %eax,%ebx
+ psrlq $17,%xmm6
+ xorl %esi,%ecx
+ addl 72(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ psrlq $2,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm6,%xmm7
+ movl 8(%esp),%esi
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ pshufd $8,%xmm7,%xmm7
+ xorl %edi,%esi
+ rorl $5,%edx
+ movdqa 32(%ebp),%xmm6
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ pslldq $8,%xmm7
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm2
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ paddd %xmm2,%xmm6
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 76(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ movdqa %xmm6,64(%esp)
+ movl %edx,%ecx
+ movdqa %xmm0,%xmm4
+ rorl $14,%edx
+ movl 4(%esp),%esi
+ movdqa %xmm2,%xmm7
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+.byte 102,15,58,15,227,4
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+.byte 102,15,58,15,249,4
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ movdqa %xmm4,%xmm5
+ rorl $6,%edx
+ movl %eax,%ecx
+ movdqa %xmm4,%xmm6
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ psrld $3,%xmm4
+ movl %eax,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm3
+ movl %eax,16(%esp)
+ xorl %eax,%ecx
+ psrld $7,%xmm6
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ pshufd $250,%xmm2,%xmm7
+ xorl %esi,%ecx
+ addl 80(%esp),%edx
+ pslld $14,%xmm5
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm4
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ psrld $11,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm5,%xmm4
+ movl (%esp),%esi
+ xorl %ecx,%edx
+ pslld $11,%xmm5
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ pxor %xmm6,%xmm4
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ movdqa %xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ pxor %xmm5,%xmm4
+ movl %ebx,%ecx
+ addl %edi,%edx
+ psrld $10,%xmm7
+ movl 16(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm4,%xmm3
+ movl %ebx,12(%esp)
+ xorl %ebx,%ecx
+ psrlq $17,%xmm6
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ rorl $11,%ecx
+ pxor %xmm6,%xmm7
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ psrlq $2,%xmm6
+ addl 84(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ pshufd $128,%xmm7,%xmm7
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 28(%esp),%esi
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ psrldq $8,%xmm7
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ paddd %xmm7,%xmm3
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,8(%esp)
+ pshufd $80,%xmm3,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ movdqa %xmm7,%xmm6
+ rorl $11,%ecx
+ psrld $10,%xmm7
+ andl %eax,%ebx
+ psrlq $17,%xmm6
+ xorl %esi,%ecx
+ addl 88(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ psrlq $2,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm6,%xmm7
+ movl 24(%esp),%esi
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ pshufd $8,%xmm7,%xmm7
+ xorl %edi,%esi
+ rorl $5,%edx
+ movdqa 48(%ebp),%xmm6
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ pslldq $8,%xmm7
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm3
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ paddd %xmm3,%xmm6
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 92(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ movdqa %xmm6,80(%esp)
+ cmpl $66051,64(%ebp)
+ jne L012ssse3_00_47
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 20(%esp),%esi
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 32(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 16(%esp),%esi
+ xorl %ecx,%edx
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl (%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,28(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 36(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 12(%esp),%esi
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,24(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 40(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 8(%esp),%esi
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 44(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 4(%esp),%esi
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,16(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 48(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl (%esp),%esi
+ xorl %ecx,%edx
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 16(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,12(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 52(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 28(%esp),%esi
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,8(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 56(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 24(%esp),%esi
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 60(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 20(%esp),%esi
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 64(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 16(%esp),%esi
+ xorl %ecx,%edx
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl (%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,28(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 68(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 12(%esp),%esi
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,24(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 72(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 8(%esp),%esi
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 76(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 4(%esp),%esi
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,16(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 80(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl (%esp),%esi
+ xorl %ecx,%edx
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 16(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,12(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 84(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 28(%esp),%esi
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,8(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 88(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 24(%esp),%esi
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 92(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ movl 96(%esp),%esi
+ xorl %edi,%ebx
+ movl 12(%esp),%ecx
+ addl (%esi),%eax
+ addl 4(%esi),%ebx
+ addl 8(%esi),%edi
+ addl 12(%esi),%ecx
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %edi,8(%esi)
+ movl %ecx,12(%esi)
+ movl %ebx,4(%esp)
+ xorl %edi,%ebx
+ movl %edi,8(%esp)
+ movl %ecx,12(%esp)
+ movl 20(%esp),%edi
+ movl 24(%esp),%ecx
+ addl 16(%esi),%edx
+ addl 20(%esi),%edi
+ addl 24(%esi),%ecx
+ movl %edx,16(%esi)
+ movl %edi,20(%esi)
+ movl %edi,20(%esp)
+ movl 28(%esp),%edi
+ movl %ecx,24(%esi)
+ addl 28(%esi),%edi
+ movl %ecx,24(%esp)
+ movl %edi,28(%esi)
+ movl %edi,28(%esp)
+ movl 100(%esp),%edi
+ movdqa 64(%ebp),%xmm7
+ subl $192,%ebp
+ cmpl 104(%esp),%edi
+ jb L011grand_ssse3
+ movl 108(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.section __IMPORT,__pointers,non_lazy_symbol_pointers
+L_OPENSSL_ia32cap_P$non_lazy_ptr:
+.indirect_symbol _OPENSSL_ia32cap_P
+.long 0
#endif
diff --git a/mac-x86/crypto/sha/sha512-586.S b/mac-x86/crypto/sha/sha512-586.S
index 99dbc31..3066100 100644
--- a/mac-x86/crypto/sha/sha512-586.S
+++ b/mac-x86/crypto/sha/sha512-586.S
@@ -26,6 +26,2269 @@ L000pic_point:
movl %edi,4(%esp)
movl %eax,8(%esp)
movl %ebx,12(%esp)
+ movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L001K512(%ebp),%edx
+ movl (%edx),%ecx
+ testl $67108864,%ecx
+ jz L002loop_x86
+ movl 4(%edx),%edx
+ movq (%esi),%mm0
+ andl $16777216,%ecx
+ movq 8(%esi),%mm1
+ andl $512,%edx
+ movq 16(%esi),%mm2
+ orl %edx,%ecx
+ movq 24(%esi),%mm3
+ movq 32(%esi),%mm4
+ movq 40(%esi),%mm5
+ movq 48(%esi),%mm6
+ movq 56(%esi),%mm7
+ cmpl $16777728,%ecx
+ je L003SSSE3
+ subl $80,%esp
+ jmp L004loop_sse2
+.align 4,0x90
+L004loop_sse2:
+ movq %mm1,8(%esp)
+ movq %mm2,16(%esp)
+ movq %mm3,24(%esp)
+ movq %mm5,40(%esp)
+ movq %mm6,48(%esp)
+ pxor %mm1,%mm2
+ movq %mm7,56(%esp)
+ movq %mm0,%mm3
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+ addl $8,%edi
+ movl $15,%edx
+ bswap %eax
+ bswap %ebx
+ jmp L00500_14_sse2
+.align 4,0x90
+L00500_14_sse2:
+ movd %eax,%mm1
+ movl (%edi),%eax
+ movd %ebx,%mm7
+ movl 4(%edi),%ebx
+ addl $8,%edi
+ bswap %eax
+ bswap %ebx
+ punpckldq %mm1,%mm7
+ movq %mm4,%mm1
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ movq %mm3,%mm0
+ movq %mm7,72(%esp)
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ paddq (%ebp),%mm7
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ subl $8,%esp
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 40(%esp),%mm5
+ paddq %mm2,%mm3
+ movq %mm0,%mm2
+ addl $8,%ebp
+ paddq %mm6,%mm3
+ movq 48(%esp),%mm6
+ decl %edx
+ jnz L00500_14_sse2
+ movd %eax,%mm1
+ movd %ebx,%mm7
+ punpckldq %mm1,%mm7
+ movq %mm4,%mm1
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ movq %mm3,%mm0
+ movq %mm7,72(%esp)
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ paddq (%ebp),%mm7
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ subl $8,%esp
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 192(%esp),%mm7
+ paddq %mm2,%mm3
+ movq %mm0,%mm2
+ addl $8,%ebp
+ paddq %mm6,%mm3
+ pxor %mm0,%mm0
+ movl $32,%edx
+ jmp L00616_79_sse2
+.align 4,0x90
+L00616_79_sse2:
+ movq 88(%esp),%mm5
+ movq %mm7,%mm1
+ psrlq $1,%mm7
+ movq %mm5,%mm6
+ psrlq $6,%mm5
+ psllq $56,%mm1
+ paddq %mm3,%mm0
+ movq %mm7,%mm3
+ psrlq $6,%mm7
+ pxor %mm1,%mm3
+ psllq $7,%mm1
+ pxor %mm7,%mm3
+ psrlq $1,%mm7
+ pxor %mm1,%mm3
+ movq %mm5,%mm1
+ psrlq $13,%mm5
+ pxor %mm3,%mm7
+ psllq $3,%mm6
+ pxor %mm5,%mm1
+ paddq 200(%esp),%mm7
+ pxor %mm6,%mm1
+ psrlq $42,%mm5
+ paddq 128(%esp),%mm7
+ pxor %mm5,%mm1
+ psllq $42,%mm6
+ movq 40(%esp),%mm5
+ pxor %mm6,%mm1
+ movq 48(%esp),%mm6
+ paddq %mm1,%mm7
+ movq %mm4,%mm1
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ movq %mm7,72(%esp)
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ paddq (%ebp),%mm7
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ subl $8,%esp
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 192(%esp),%mm7
+ paddq %mm6,%mm2
+ addl $8,%ebp
+ movq 88(%esp),%mm5
+ movq %mm7,%mm1
+ psrlq $1,%mm7
+ movq %mm5,%mm6
+ psrlq $6,%mm5
+ psllq $56,%mm1
+ paddq %mm3,%mm2
+ movq %mm7,%mm3
+ psrlq $6,%mm7
+ pxor %mm1,%mm3
+ psllq $7,%mm1
+ pxor %mm7,%mm3
+ psrlq $1,%mm7
+ pxor %mm1,%mm3
+ movq %mm5,%mm1
+ psrlq $13,%mm5
+ pxor %mm3,%mm7
+ psllq $3,%mm6
+ pxor %mm5,%mm1
+ paddq 200(%esp),%mm7
+ pxor %mm6,%mm1
+ psrlq $42,%mm5
+ paddq 128(%esp),%mm7
+ pxor %mm5,%mm1
+ psllq $42,%mm6
+ movq 40(%esp),%mm5
+ pxor %mm6,%mm1
+ movq 48(%esp),%mm6
+ paddq %mm1,%mm7
+ movq %mm4,%mm1
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ movq %mm7,72(%esp)
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ paddq (%ebp),%mm7
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ subl $8,%esp
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 192(%esp),%mm7
+ paddq %mm6,%mm0
+ addl $8,%ebp
+ decl %edx
+ jnz L00616_79_sse2
+ paddq %mm3,%mm0
+ movq 8(%esp),%mm1
+ movq 24(%esp),%mm3
+ movq 40(%esp),%mm5
+ movq 48(%esp),%mm6
+ movq 56(%esp),%mm7
+ pxor %mm1,%mm2
+ paddq (%esi),%mm0
+ paddq 8(%esi),%mm1
+ paddq 16(%esi),%mm2
+ paddq 24(%esi),%mm3
+ paddq 32(%esi),%mm4
+ paddq 40(%esi),%mm5
+ paddq 48(%esi),%mm6
+ paddq 56(%esi),%mm7
+ movl $640,%eax
+ movq %mm0,(%esi)
+ movq %mm1,8(%esi)
+ movq %mm2,16(%esi)
+ movq %mm3,24(%esi)
+ movq %mm4,32(%esi)
+ movq %mm5,40(%esi)
+ movq %mm6,48(%esi)
+ movq %mm7,56(%esi)
+ leal (%esp,%eax,1),%esp
+ subl %eax,%ebp
+ cmpl 88(%esp),%edi
+ jb L004loop_sse2
+ movl 92(%esp),%esp
+ emms
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 5,0x90
+L003SSSE3:
+ leal -64(%esp),%edx
+ subl $256,%esp
+ movdqa 640(%ebp),%xmm1
+ movdqu (%edi),%xmm0
+.byte 102,15,56,0,193
+ movdqa (%ebp),%xmm3
+ movdqa %xmm1,%xmm2
+ movdqu 16(%edi),%xmm1
+ paddq %xmm0,%xmm3
+.byte 102,15,56,0,202
+ movdqa %xmm3,-128(%edx)
+ movdqa 16(%ebp),%xmm4
+ movdqa %xmm2,%xmm3
+ movdqu 32(%edi),%xmm2
+ paddq %xmm1,%xmm4
+.byte 102,15,56,0,211
+ movdqa %xmm4,-112(%edx)
+ movdqa 32(%ebp),%xmm5
+ movdqa %xmm3,%xmm4
+ movdqu 48(%edi),%xmm3
+ paddq %xmm2,%xmm5
+.byte 102,15,56,0,220
+ movdqa %xmm5,-96(%edx)
+ movdqa 48(%ebp),%xmm6
+ movdqa %xmm4,%xmm5
+ movdqu 64(%edi),%xmm4
+ paddq %xmm3,%xmm6
+.byte 102,15,56,0,229
+ movdqa %xmm6,-80(%edx)
+ movdqa 64(%ebp),%xmm7
+ movdqa %xmm5,%xmm6
+ movdqu 80(%edi),%xmm5
+ paddq %xmm4,%xmm7
+.byte 102,15,56,0,238
+ movdqa %xmm7,-64(%edx)
+ movdqa %xmm0,(%edx)
+ movdqa 80(%ebp),%xmm0
+ movdqa %xmm6,%xmm7
+ movdqu 96(%edi),%xmm6
+ paddq %xmm5,%xmm0
+.byte 102,15,56,0,247
+ movdqa %xmm0,-48(%edx)
+ movdqa %xmm1,16(%edx)
+ movdqa 96(%ebp),%xmm1
+ movdqa %xmm7,%xmm0
+ movdqu 112(%edi),%xmm7
+ paddq %xmm6,%xmm1
+.byte 102,15,56,0,248
+ movdqa %xmm1,-32(%edx)
+ movdqa %xmm2,32(%edx)
+ movdqa 112(%ebp),%xmm2
+ movdqa (%edx),%xmm0
+ paddq %xmm7,%xmm2
+ movdqa %xmm2,-16(%edx)
+ nop
+.align 5,0x90
+L007loop_ssse3:
+ movdqa 16(%edx),%xmm2
+ movdqa %xmm3,48(%edx)
+ leal 128(%ebp),%ebp
+ movq %mm1,8(%esp)
+ movl %edi,%ebx
+ movq %mm2,16(%esp)
+ leal 128(%edi),%edi
+ movq %mm3,24(%esp)
+ cmpl %eax,%edi
+ movq %mm5,40(%esp)
+ cmovbl %edi,%ebx
+ movq %mm6,48(%esp)
+ movl $4,%ecx
+ pxor %mm1,%mm2
+ movq %mm7,56(%esp)
+ pxor %mm3,%mm3
+ jmp L00800_47_ssse3
+.align 5,0x90
+L00800_47_ssse3:
+ movdqa %xmm5,%xmm3
+ movdqa %xmm2,%xmm1
+.byte 102,15,58,15,208,8
+ movdqa %xmm4,(%edx)
+.byte 102,15,58,15,220,8
+ movdqa %xmm2,%xmm4
+ psrlq $7,%xmm2
+ paddq %xmm3,%xmm0
+ movdqa %xmm4,%xmm3
+ psrlq $1,%xmm4
+ psllq $56,%xmm3
+ pxor %xmm4,%xmm2
+ psrlq $7,%xmm4
+ pxor %xmm3,%xmm2
+ psllq $7,%xmm3
+ pxor %xmm4,%xmm2
+ movdqa %xmm7,%xmm4
+ pxor %xmm3,%xmm2
+ movdqa %xmm7,%xmm3
+ psrlq $6,%xmm4
+ paddq %xmm2,%xmm0
+ movdqa %xmm7,%xmm2
+ psrlq $19,%xmm3
+ psllq $3,%xmm2
+ pxor %xmm3,%xmm4
+ psrlq $42,%xmm3
+ pxor %xmm2,%xmm4
+ psllq $42,%xmm2
+ pxor %xmm3,%xmm4
+ movdqa 32(%edx),%xmm3
+ pxor %xmm2,%xmm4
+ movdqa (%ebp),%xmm2
+ movq %mm4,%mm1
+ paddq %xmm4,%xmm0
+ movq -128(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ paddq %xmm0,%xmm2
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 32(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 40(%esp),%mm6
+ movq %mm4,%mm1
+ movq -120(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,24(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,56(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 48(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 16(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq (%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 24(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 32(%esp),%mm6
+ movdqa %xmm2,-128(%edx)
+ movdqa %xmm6,%xmm4
+ movdqa %xmm3,%xmm2
+.byte 102,15,58,15,217,8
+ movdqa %xmm5,16(%edx)
+.byte 102,15,58,15,229,8
+ movdqa %xmm3,%xmm5
+ psrlq $7,%xmm3
+ paddq %xmm4,%xmm1
+ movdqa %xmm5,%xmm4
+ psrlq $1,%xmm5
+ psllq $56,%xmm4
+ pxor %xmm5,%xmm3
+ psrlq $7,%xmm5
+ pxor %xmm4,%xmm3
+ psllq $7,%xmm4
+ pxor %xmm5,%xmm3
+ movdqa %xmm0,%xmm5
+ pxor %xmm4,%xmm3
+ movdqa %xmm0,%xmm4
+ psrlq $6,%xmm5
+ paddq %xmm3,%xmm1
+ movdqa %xmm0,%xmm3
+ psrlq $19,%xmm4
+ psllq $3,%xmm3
+ pxor %xmm4,%xmm5
+ psrlq $42,%xmm4
+ pxor %xmm3,%xmm5
+ psllq $42,%xmm3
+ pxor %xmm4,%xmm5
+ movdqa 48(%edx),%xmm4
+ pxor %xmm3,%xmm5
+ movdqa 16(%ebp),%xmm3
+ movq %mm4,%mm1
+ paddq %xmm5,%xmm1
+ movq -112(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,16(%esp)
+ paddq %xmm1,%xmm3
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,48(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 40(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 8(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 56(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 16(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 24(%esp),%mm6
+ movq %mm4,%mm1
+ movq -104(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,8(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,40(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 32(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq (%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 48(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 8(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 16(%esp),%mm6
+ movdqa %xmm3,-112(%edx)
+ movdqa %xmm7,%xmm5
+ movdqa %xmm4,%xmm3
+.byte 102,15,58,15,226,8
+ movdqa %xmm6,32(%edx)
+.byte 102,15,58,15,238,8
+ movdqa %xmm4,%xmm6
+ psrlq $7,%xmm4
+ paddq %xmm5,%xmm2
+ movdqa %xmm6,%xmm5
+ psrlq $1,%xmm6
+ psllq $56,%xmm5
+ pxor %xmm6,%xmm4
+ psrlq $7,%xmm6
+ pxor %xmm5,%xmm4
+ psllq $7,%xmm5
+ pxor %xmm6,%xmm4
+ movdqa %xmm1,%xmm6
+ pxor %xmm5,%xmm4
+ movdqa %xmm1,%xmm5
+ psrlq $6,%xmm6
+ paddq %xmm4,%xmm2
+ movdqa %xmm1,%xmm4
+ psrlq $19,%xmm5
+ psllq $3,%xmm4
+ pxor %xmm5,%xmm6
+ psrlq $42,%xmm5
+ pxor %xmm4,%xmm6
+ psllq $42,%xmm4
+ pxor %xmm5,%xmm6
+ movdqa (%edx),%xmm5
+ pxor %xmm4,%xmm6
+ movdqa 32(%ebp),%xmm4
+ movq %mm4,%mm1
+ paddq %xmm6,%xmm2
+ movq -96(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,(%esp)
+ paddq %xmm2,%xmm4
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,32(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 24(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 56(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 40(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq (%esp),%mm5
+ paddq %mm6,%mm2
+ movq 8(%esp),%mm6
+ movq %mm4,%mm1
+ movq -88(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,56(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,24(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 16(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 48(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 32(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 56(%esp),%mm5
+ paddq %mm6,%mm0
+ movq (%esp),%mm6
+ movdqa %xmm4,-96(%edx)
+ movdqa %xmm0,%xmm6
+ movdqa %xmm5,%xmm4
+.byte 102,15,58,15,235,8
+ movdqa %xmm7,48(%edx)
+.byte 102,15,58,15,247,8
+ movdqa %xmm5,%xmm7
+ psrlq $7,%xmm5
+ paddq %xmm6,%xmm3
+ movdqa %xmm7,%xmm6
+ psrlq $1,%xmm7
+ psllq $56,%xmm6
+ pxor %xmm7,%xmm5
+ psrlq $7,%xmm7
+ pxor %xmm6,%xmm5
+ psllq $7,%xmm6
+ pxor %xmm7,%xmm5
+ movdqa %xmm2,%xmm7
+ pxor %xmm6,%xmm5
+ movdqa %xmm2,%xmm6
+ psrlq $6,%xmm7
+ paddq %xmm5,%xmm3
+ movdqa %xmm2,%xmm5
+ psrlq $19,%xmm6
+ psllq $3,%xmm5
+ pxor %xmm6,%xmm7
+ psrlq $42,%xmm6
+ pxor %xmm5,%xmm7
+ psllq $42,%xmm5
+ pxor %xmm6,%xmm7
+ movdqa 16(%edx),%xmm6
+ pxor %xmm5,%xmm7
+ movdqa 48(%ebp),%xmm5
+ movq %mm4,%mm1
+ paddq %xmm7,%xmm3
+ movq -80(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,48(%esp)
+ paddq %xmm3,%xmm5
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,16(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 8(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 40(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 24(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 48(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 56(%esp),%mm6
+ movq %mm4,%mm1
+ movq -72(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,40(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,8(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq (%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 32(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 16(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 40(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 48(%esp),%mm6
+ movdqa %xmm5,-80(%edx)
+ movdqa %xmm1,%xmm7
+ movdqa %xmm6,%xmm5
+.byte 102,15,58,15,244,8
+ movdqa %xmm0,(%edx)
+.byte 102,15,58,15,248,8
+ movdqa %xmm6,%xmm0
+ psrlq $7,%xmm6
+ paddq %xmm7,%xmm4
+ movdqa %xmm0,%xmm7
+ psrlq $1,%xmm0
+ psllq $56,%xmm7
+ pxor %xmm0,%xmm6
+ psrlq $7,%xmm0
+ pxor %xmm7,%xmm6
+ psllq $7,%xmm7
+ pxor %xmm0,%xmm6
+ movdqa %xmm3,%xmm0
+ pxor %xmm7,%xmm6
+ movdqa %xmm3,%xmm7
+ psrlq $6,%xmm0
+ paddq %xmm6,%xmm4
+ movdqa %xmm3,%xmm6
+ psrlq $19,%xmm7
+ psllq $3,%xmm6
+ pxor %xmm7,%xmm0
+ psrlq $42,%xmm7
+ pxor %xmm6,%xmm0
+ psllq $42,%xmm6
+ pxor %xmm7,%xmm0
+ movdqa 32(%edx),%xmm7
+ pxor %xmm6,%xmm0
+ movdqa 64(%ebp),%xmm6
+ movq %mm4,%mm1
+ paddq %xmm0,%xmm4
+ movq -64(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ paddq %xmm4,%xmm6
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 32(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 40(%esp),%mm6
+ movq %mm4,%mm1
+ movq -56(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,24(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,56(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 48(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 16(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq (%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 24(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 32(%esp),%mm6
+ movdqa %xmm6,-64(%edx)
+ movdqa %xmm2,%xmm0
+ movdqa %xmm7,%xmm6
+.byte 102,15,58,15,253,8
+ movdqa %xmm1,16(%edx)
+.byte 102,15,58,15,193,8
+ movdqa %xmm7,%xmm1
+ psrlq $7,%xmm7
+ paddq %xmm0,%xmm5
+ movdqa %xmm1,%xmm0
+ psrlq $1,%xmm1
+ psllq $56,%xmm0
+ pxor %xmm1,%xmm7
+ psrlq $7,%xmm1
+ pxor %xmm0,%xmm7
+ psllq $7,%xmm0
+ pxor %xmm1,%xmm7
+ movdqa %xmm4,%xmm1
+ pxor %xmm0,%xmm7
+ movdqa %xmm4,%xmm0
+ psrlq $6,%xmm1
+ paddq %xmm7,%xmm5
+ movdqa %xmm4,%xmm7
+ psrlq $19,%xmm0
+ psllq $3,%xmm7
+ pxor %xmm0,%xmm1
+ psrlq $42,%xmm0
+ pxor %xmm7,%xmm1
+ psllq $42,%xmm7
+ pxor %xmm0,%xmm1
+ movdqa 48(%edx),%xmm0
+ pxor %xmm7,%xmm1
+ movdqa 80(%ebp),%xmm7
+ movq %mm4,%mm1
+ paddq %xmm1,%xmm5
+ movq -48(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,16(%esp)
+ paddq %xmm5,%xmm7
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,48(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 40(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 8(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 56(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 16(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 24(%esp),%mm6
+ movq %mm4,%mm1
+ movq -40(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,8(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,40(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 32(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq (%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 48(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 8(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 16(%esp),%mm6
+ movdqa %xmm7,-48(%edx)
+ movdqa %xmm3,%xmm1
+ movdqa %xmm0,%xmm7
+.byte 102,15,58,15,198,8
+ movdqa %xmm2,32(%edx)
+.byte 102,15,58,15,202,8
+ movdqa %xmm0,%xmm2
+ psrlq $7,%xmm0
+ paddq %xmm1,%xmm6
+ movdqa %xmm2,%xmm1
+ psrlq $1,%xmm2
+ psllq $56,%xmm1
+ pxor %xmm2,%xmm0
+ psrlq $7,%xmm2
+ pxor %xmm1,%xmm0
+ psllq $7,%xmm1
+ pxor %xmm2,%xmm0
+ movdqa %xmm5,%xmm2
+ pxor %xmm1,%xmm0
+ movdqa %xmm5,%xmm1
+ psrlq $6,%xmm2
+ paddq %xmm0,%xmm6
+ movdqa %xmm5,%xmm0
+ psrlq $19,%xmm1
+ psllq $3,%xmm0
+ pxor %xmm1,%xmm2
+ psrlq $42,%xmm1
+ pxor %xmm0,%xmm2
+ psllq $42,%xmm0
+ pxor %xmm1,%xmm2
+ movdqa (%edx),%xmm1
+ pxor %xmm0,%xmm2
+ movdqa 96(%ebp),%xmm0
+ movq %mm4,%mm1
+ paddq %xmm2,%xmm6
+ movq -32(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,(%esp)
+ paddq %xmm6,%xmm0
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,32(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 24(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 56(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 40(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq (%esp),%mm5
+ paddq %mm6,%mm2
+ movq 8(%esp),%mm6
+ movq %mm4,%mm1
+ movq -24(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,56(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,24(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 16(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 48(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 32(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 56(%esp),%mm5
+ paddq %mm6,%mm0
+ movq (%esp),%mm6
+ movdqa %xmm0,-32(%edx)
+ movdqa %xmm4,%xmm2
+ movdqa %xmm1,%xmm0
+.byte 102,15,58,15,207,8
+ movdqa %xmm3,48(%edx)
+.byte 102,15,58,15,211,8
+ movdqa %xmm1,%xmm3
+ psrlq $7,%xmm1
+ paddq %xmm2,%xmm7
+ movdqa %xmm3,%xmm2
+ psrlq $1,%xmm3
+ psllq $56,%xmm2
+ pxor %xmm3,%xmm1
+ psrlq $7,%xmm3
+ pxor %xmm2,%xmm1
+ psllq $7,%xmm2
+ pxor %xmm3,%xmm1
+ movdqa %xmm6,%xmm3
+ pxor %xmm2,%xmm1
+ movdqa %xmm6,%xmm2
+ psrlq $6,%xmm3
+ paddq %xmm1,%xmm7
+ movdqa %xmm6,%xmm1
+ psrlq $19,%xmm2
+ psllq $3,%xmm1
+ pxor %xmm2,%xmm3
+ psrlq $42,%xmm2
+ pxor %xmm1,%xmm3
+ psllq $42,%xmm1
+ pxor %xmm2,%xmm3
+ movdqa 16(%edx),%xmm2
+ pxor %xmm1,%xmm3
+ movdqa 112(%ebp),%xmm1
+ movq %mm4,%mm1
+ paddq %xmm3,%xmm7
+ movq -16(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,48(%esp)
+ paddq %xmm7,%xmm1
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,16(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 8(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 40(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 24(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 48(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 56(%esp),%mm6
+ movq %mm4,%mm1
+ movq -8(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,40(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,8(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq (%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 32(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 16(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 40(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 48(%esp),%mm6
+ movdqa %xmm1,-16(%edx)
+ leal 128(%ebp),%ebp
+ decl %ecx
+ jnz L00800_47_ssse3
+ movdqa (%ebp),%xmm1
+ leal -640(%ebp),%ebp
+ movdqu (%ebx),%xmm0
+.byte 102,15,56,0,193
+ movdqa (%ebp),%xmm3
+ movdqa %xmm1,%xmm2
+ movdqu 16(%ebx),%xmm1
+ paddq %xmm0,%xmm3
+.byte 102,15,56,0,202
+ movq %mm4,%mm1
+ movq -128(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 32(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 40(%esp),%mm6
+ movq %mm4,%mm1
+ movq -120(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,24(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,56(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 48(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 16(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq (%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 24(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 32(%esp),%mm6
+ movdqa %xmm3,-128(%edx)
+ movdqa 16(%ebp),%xmm4
+ movdqa %xmm2,%xmm3
+ movdqu 32(%ebx),%xmm2
+ paddq %xmm1,%xmm4
+.byte 102,15,56,0,211
+ movq %mm4,%mm1
+ movq -112(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,16(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,48(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 40(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 8(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 56(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 16(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 24(%esp),%mm6
+ movq %mm4,%mm1
+ movq -104(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,8(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,40(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 32(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq (%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 48(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 8(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 16(%esp),%mm6
+ movdqa %xmm4,-112(%edx)
+ movdqa 32(%ebp),%xmm5
+ movdqa %xmm3,%xmm4
+ movdqu 48(%ebx),%xmm3
+ paddq %xmm2,%xmm5
+.byte 102,15,56,0,220
+ movq %mm4,%mm1
+ movq -96(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,32(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 24(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 56(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 40(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq (%esp),%mm5
+ paddq %mm6,%mm2
+ movq 8(%esp),%mm6
+ movq %mm4,%mm1
+ movq -88(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,56(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,24(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 16(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 48(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 32(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 56(%esp),%mm5
+ paddq %mm6,%mm0
+ movq (%esp),%mm6
+ movdqa %xmm5,-96(%edx)
+ movdqa 48(%ebp),%xmm6
+ movdqa %xmm4,%xmm5
+ movdqu 64(%ebx),%xmm4
+ paddq %xmm3,%xmm6
+.byte 102,15,56,0,229
+ movq %mm4,%mm1
+ movq -80(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,48(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,16(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 8(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 40(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 24(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 48(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 56(%esp),%mm6
+ movq %mm4,%mm1
+ movq -72(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,40(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,8(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq (%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 32(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 16(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 40(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 48(%esp),%mm6
+ movdqa %xmm6,-80(%edx)
+ movdqa 64(%ebp),%xmm7
+ movdqa %xmm5,%xmm6
+ movdqu 80(%ebx),%xmm5
+ paddq %xmm4,%xmm7
+.byte 102,15,56,0,238
+ movq %mm4,%mm1
+ movq -64(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 32(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 40(%esp),%mm6
+ movq %mm4,%mm1
+ movq -56(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,24(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,56(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 48(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 16(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq (%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 24(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 32(%esp),%mm6
+ movdqa %xmm7,-64(%edx)
+ movdqa %xmm0,(%edx)
+ movdqa 80(%ebp),%xmm0
+ movdqa %xmm6,%xmm7
+ movdqu 96(%ebx),%xmm6
+ paddq %xmm5,%xmm0
+.byte 102,15,56,0,247
+ movq %mm4,%mm1
+ movq -48(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,16(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,48(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 40(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 8(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 56(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 16(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 24(%esp),%mm6
+ movq %mm4,%mm1
+ movq -40(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,8(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,40(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 32(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq (%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 48(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 8(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 16(%esp),%mm6
+ movdqa %xmm0,-48(%edx)
+ movdqa %xmm1,16(%edx)
+ movdqa 96(%ebp),%xmm1
+ movdqa %xmm7,%xmm0
+ movdqu 112(%ebx),%xmm7
+ paddq %xmm6,%xmm1
+.byte 102,15,56,0,248
+ movq %mm4,%mm1
+ movq -32(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,32(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 24(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 56(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 40(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq (%esp),%mm5
+ paddq %mm6,%mm2
+ movq 8(%esp),%mm6
+ movq %mm4,%mm1
+ movq -24(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,56(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,24(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 16(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 48(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 32(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 56(%esp),%mm5
+ paddq %mm6,%mm0
+ movq (%esp),%mm6
+ movdqa %xmm1,-32(%edx)
+ movdqa %xmm2,32(%edx)
+ movdqa 112(%ebp),%xmm2
+ movdqa (%edx),%xmm0
+ paddq %xmm7,%xmm2
+ movq %mm4,%mm1
+ movq -16(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,48(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,16(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 8(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 40(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 24(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 48(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 56(%esp),%mm6
+ movq %mm4,%mm1
+ movq -8(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,40(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,8(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq (%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 32(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 16(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 40(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 48(%esp),%mm6
+ movdqa %xmm2,-16(%edx)
+ movq 8(%esp),%mm1
+ paddq %mm3,%mm0
+ movq 24(%esp),%mm3
+ movq 56(%esp),%mm7
+ pxor %mm1,%mm2
+ paddq (%esi),%mm0
+ paddq 8(%esi),%mm1
+ paddq 16(%esi),%mm2
+ paddq 24(%esi),%mm3
+ paddq 32(%esi),%mm4
+ paddq 40(%esi),%mm5
+ paddq 48(%esi),%mm6
+ paddq 56(%esi),%mm7
+ movq %mm0,(%esi)
+ movq %mm1,8(%esi)
+ movq %mm2,16(%esi)
+ movq %mm3,24(%esi)
+ movq %mm4,32(%esi)
+ movq %mm5,40(%esi)
+ movq %mm6,48(%esi)
+ movq %mm7,56(%esi)
+ cmpl %eax,%edi
+ jb L007loop_ssse3
+ movl 76(%edx),%esp
+ emms
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
.align 4,0x90
L002loop_x86:
movl (%edi),%eax
@@ -131,7 +2394,7 @@ L002loop_x86:
movl $16,%ecx
.long 2784229001
.align 4,0x90
-L00300_15_x86:
+L00900_15_x86:
movl 40(%esp),%ecx
movl 44(%esp),%edx
movl %ecx,%esi
@@ -238,9 +2501,9 @@ L00300_15_x86:
subl $8,%esp
leal 8(%ebp),%ebp
cmpb $148,%dl
- jne L00300_15_x86
+ jne L00900_15_x86
.align 4,0x90
-L00416_79_x86:
+L01016_79_x86:
movl 312(%esp),%ecx
movl 316(%esp),%edx
movl %ecx,%esi
@@ -413,7 +2676,7 @@ L00416_79_x86:
subl $8,%esp
leal 8(%ebp),%ebp
cmpb $23,%dl
- jne L00416_79_x86
+ jne L01016_79_x86
movl 840(%esp),%esi
movl 844(%esp),%edi
movl (%esi),%eax
@@ -563,4 +2826,8 @@ L001K512:
.byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
.byte 62,0
+.section __IMPORT,__pointers,non_lazy_symbol_pointers
+L_OPENSSL_ia32cap_P$non_lazy_ptr:
+.indirect_symbol _OPENSSL_ia32cap_P
+.long 0
#endif