diff options
Diffstat (limited to 'src/crypto/aes')
-rw-r--r-- | src/crypto/aes/aes.c | 24 | ||||
-rw-r--r-- | src/crypto/aes/asm/aes-armv4.pl | 43 | ||||
-rw-r--r-- | src/crypto/aes/asm/aesni-x86.pl | 319 | ||||
-rw-r--r-- | src/crypto/aes/asm/aesni-x86_64.pl | 945 | ||||
-rw-r--r-- | src/crypto/aes/asm/aesv8-armx.pl | 228 | ||||
-rw-r--r-- | src/crypto/aes/asm/bsaes-armv7.pl | 54 |
6 files changed, 1252 insertions, 361 deletions
diff --git a/src/crypto/aes/aes.c b/src/crypto/aes/aes.c index 97b4fbd..933aa07 100644 --- a/src/crypto/aes/aes.c +++ b/src/crypto/aes/aes.c @@ -1033,17 +1033,25 @@ void AES_decrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key) { #endif /* ?FULL_UNROLL */ /* apply last round and * map cipher state to byte array block: */ - s0 = (Td4[(t0 >> 24)] << 24) ^ (Td4[(t3 >> 16) & 0xff] << 16) ^ - (Td4[(t2 >> 8) & 0xff] << 8) ^ (Td4[(t1) & 0xff]) ^ rk[0]; + s0 = ((uint32_t)Td4[(t0 >> 24)] << 24) ^ + ((uint32_t)Td4[(t3 >> 16) & 0xff] << 16) ^ + ((uint32_t)Td4[(t2 >> 8) & 0xff] << 8) ^ + ((uint32_t)Td4[(t1) & 0xff]) ^ rk[0]; PUTU32(out, s0); - s1 = (Td4[(t1 >> 24)] << 24) ^ (Td4[(t0 >> 16) & 0xff] << 16) ^ - (Td4[(t3 >> 8) & 0xff] << 8) ^ (Td4[(t2) & 0xff]) ^ rk[1]; + s1 = ((uint32_t)Td4[(t1 >> 24)] << 24) ^ + ((uint32_t)Td4[(t0 >> 16) & 0xff] << 16) ^ + ((uint32_t)Td4[(t3 >> 8) & 0xff] << 8) ^ + ((uint32_t)Td4[(t2) & 0xff]) ^ rk[1]; PUTU32(out + 4, s1); - s2 = (Td4[(t2 >> 24)] << 24) ^ (Td4[(t1 >> 16) & 0xff] << 16) ^ - (Td4[(t0 >> 8) & 0xff] << 8) ^ (Td4[(t3) & 0xff]) ^ rk[2]; + s2 = ((uint32_t)Td4[(t2 >> 24)] << 24) ^ + ((uint32_t)Td4[(t1 >> 16) & 0xff] << 16) ^ + ((uint32_t)Td4[(t0 >> 8) & 0xff] << 8) ^ + ((uint32_t)Td4[(t3) & 0xff]) ^ rk[2]; PUTU32(out + 8, s2); - s3 = (Td4[(t3 >> 24)] << 24) ^ (Td4[(t2 >> 16) & 0xff] << 16) ^ - (Td4[(t1 >> 8) & 0xff] << 8) ^ (Td4[(t0) & 0xff]) ^ rk[3]; + s3 = ((uint32_t)Td4[(t3 >> 24)] << 24) ^ + ((uint32_t)Td4[(t2 >> 16) & 0xff] << 16) ^ + ((uint32_t)Td4[(t1 >> 8) & 0xff] << 8) ^ + ((uint32_t)Td4[(t0) & 0xff]) ^ rk[3]; PUTU32(out + 12, s3); } diff --git a/src/crypto/aes/asm/aes-armv4.pl b/src/crypto/aes/asm/aes-armv4.pl index 3bd9a6d..36cd3b6 100644 --- a/src/crypto/aes/asm/aes-armv4.pl +++ b/src/crypto/aes/asm/aes-armv4.pl @@ -32,8 +32,20 @@ # Profiler-assisted and platform-specific optimization resulted in 16% # improvement on Cortex A8 core and ~21.5 cycles per byte. -while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} -open STDOUT,">$output"; +$flavour = shift; +if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} $s0="r0"; $s1="r1"; @@ -63,7 +75,7 @@ $code=<<___; .code 32 #else .syntax unified -# ifdef __thumb2__ +# if defined(__thumb2__) && !defined(__APPLE__) .thumb # else .code 32 @@ -189,9 +201,13 @@ asm_AES_encrypt: adr r3,asm_AES_encrypt #endif stmdb sp!,{r1,r4-r12,lr} +#ifdef __APPLE__ + adr $tbl,AES_Te +#else + sub $tbl,r3,#asm_AES_encrypt-AES_Te @ Te +#endif mov $rounds,r0 @ inp mov $key,r2 - sub $tbl,r3,#asm_AES_encrypt-AES_Te @ Te #if __ARM_ARCH__<7 ldrb $s0,[$rounds,#3] @ load input data in endian-neutral ldrb $t1,[$rounds,#2] @ manner... @@ -460,12 +476,16 @@ _armv4_AES_set_encrypt_key: bne .Labrt .Lok: stmdb sp!,{r4-r12,lr} - sub $tbl,r3,#_armv4_AES_set_encrypt_key-AES_Te-1024 @ Te4 - mov $rounds,r0 @ inp mov lr,r1 @ bits mov $key,r2 @ key +#ifdef __APPLE__ + adr $tbl,AES_Te+1024 @ Te4 +#else + sub $tbl,r3,#_armv4_AES_set_encrypt_key-AES_Te-1024 @ Te4 +#endif + #if __ARM_ARCH__<7 ldrb $s0,[$rounds,#3] @ load input data in endian-neutral ldrb $t1,[$rounds,#2] @ manner... @@ -718,8 +738,8 @@ _armv4_AES_set_encrypt_key: .Ldone: mov r0,#0 ldmia sp!,{r4-r12,lr} .Labrt: -#if defined(__thumb2__) && __ARM_ARCH__>=7 - .short 0x4770 @ bx lr in Thumb2 encoding +#if __ARM_ARCH__>=5 + ret @ bx lr #else tst lr,#1 moveq pc,lr @ be binary compatible with V4, yet @@ -961,9 +981,13 @@ asm_AES_decrypt: adr r3,asm_AES_decrypt #endif stmdb sp!,{r1,r4-r12,lr} +#ifdef __APPLE__ + adr $tbl,AES_Td +#else + sub $tbl,r3,#asm_AES_decrypt-AES_Td @ Td +#endif mov $rounds,r0 @ inp mov $key,r2 - sub $tbl,r3,#asm_AES_decrypt-AES_Td @ Td #if __ARM_ARCH__<7 ldrb $s0,[$rounds,#3] @ load input data in endian-neutral ldrb $t1,[$rounds,#2] @ manner... @@ -1211,6 +1235,7 @@ _armv4_AES_decrypt: ___ $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 +$code =~ s/\bret\b/bx\tlr/gm; open SELF,$0; while(<SELF>) { diff --git a/src/crypto/aes/asm/aesni-x86.pl b/src/crypto/aes/asm/aesni-x86.pl index 3deb86a..f67df8c 100644 --- a/src/crypto/aes/asm/aesni-x86.pl +++ b/src/crypto/aes/asm/aesni-x86.pl @@ -51,7 +51,7 @@ # Westmere 3.77/1.37 1.37 1.52 1.27 # * Bridge 5.07/0.98 0.99 1.09 0.91 # Haswell 4.44/0.80 0.97 1.03 0.72 -# Atom 5.77/3.56 3.67 4.03 3.46 +# Silvermont 5.77/3.56 3.67 4.03 3.46 # Bulldozer 5.80/0.98 1.05 1.24 0.93 $PREFIX="aesni"; # if $PREFIX is set to "AES", the script @@ -65,6 +65,9 @@ require "x86asm.pl"; &asm_init($ARGV[0],$0); +&external_label("OPENSSL_ia32cap_P"); +&static_label("key_const"); + if ($PREFIX eq "aesni") { $movekey=\&movups; } else { $movekey=\&movups; } @@ -181,7 +184,10 @@ sub aesni_generate1 # fully unrolled loop { &aesni_inline_generate1("enc"); } else { &call ("_aesni_encrypt1"); } + &pxor ($rndkey0,$rndkey0); # clear register bank + &pxor ($rndkey1,$rndkey1); &movups (&QWP(0,"eax"),$inout0); + &pxor ($inout0,$inout0); &ret (); &function_end_B("${PREFIX}_encrypt"); @@ -197,7 +203,10 @@ sub aesni_generate1 # fully unrolled loop { &aesni_inline_generate1("dec"); } else { &call ("_aesni_decrypt1"); } + &pxor ($rndkey0,$rndkey0); # clear register bank + &pxor ($rndkey1,$rndkey1); &movups (&QWP(0,"eax"),$inout0); + &pxor ($inout0,$inout0); &ret (); &function_end_B("${PREFIX}_decrypt"); @@ -349,17 +358,15 @@ sub aesni_generate6 &neg ($rounds); eval"&aes${p} ($inout2,$rndkey1)"; &pxor ($inout5,$rndkey0); + &$movekey ($rndkey0,&QWP(0,$key,$rounds)); &add ($rounds,16); - eval"&aes${p} ($inout3,$rndkey1)"; - eval"&aes${p} ($inout4,$rndkey1)"; - eval"&aes${p} ($inout5,$rndkey1)"; - &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); - &jmp (&label("_aesni_${p}rypt6_enter")); + &jmp (&label("_aesni_${p}rypt6_inner")); &set_label("${p}6_loop",16); eval"&aes${p} ($inout0,$rndkey1)"; eval"&aes${p} ($inout1,$rndkey1)"; eval"&aes${p} ($inout2,$rndkey1)"; + &set_label("_aesni_${p}rypt6_inner"); eval"&aes${p} ($inout3,$rndkey1)"; eval"&aes${p} ($inout4,$rndkey1)"; eval"&aes${p} ($inout5,$rndkey1)"; @@ -615,6 +622,14 @@ if ($PREFIX eq "aesni") { &movups (&QWP(0x30,$out),$inout3); &set_label("ecb_ret"); + &pxor ("xmm0","xmm0"); # clear register bank + &pxor ("xmm1","xmm1"); + &pxor ("xmm2","xmm2"); + &pxor ("xmm3","xmm3"); + &pxor ("xmm4","xmm4"); + &pxor ("xmm5","xmm5"); + &pxor ("xmm6","xmm6"); + &pxor ("xmm7","xmm7"); &function_end("aesni_ecb_encrypt"); ###################################################################### @@ -704,6 +719,15 @@ if ($PREFIX eq "aesni") { &mov ("esp",&DWP(48,"esp")); &mov ($out,&wparam(5)); &movups (&QWP(0,$out),$cmac); + + &pxor ("xmm0","xmm0"); # clear register bank + &pxor ("xmm1","xmm1"); + &pxor ("xmm2","xmm2"); + &pxor ("xmm3","xmm3"); + &pxor ("xmm4","xmm4"); + &pxor ("xmm5","xmm5"); + &pxor ("xmm6","xmm6"); + &pxor ("xmm7","xmm7"); &function_end("aesni_ccm64_encrypt_blocks"); &function_begin("aesni_ccm64_decrypt_blocks"); @@ -804,6 +828,15 @@ if ($PREFIX eq "aesni") { &mov ("esp",&DWP(48,"esp")); &mov ($out,&wparam(5)); &movups (&QWP(0,$out),$cmac); + + &pxor ("xmm0","xmm0"); # clear register bank + &pxor ("xmm1","xmm1"); + &pxor ("xmm2","xmm2"); + &pxor ("xmm3","xmm3"); + &pxor ("xmm4","xmm4"); + &pxor ("xmm5","xmm5"); + &pxor ("xmm6","xmm6"); + &pxor ("xmm7","xmm7"); &function_end("aesni_ccm64_decrypt_blocks"); } @@ -1053,6 +1086,17 @@ if ($PREFIX eq "aesni") { &movups (&QWP(0x30,$out),$inout3); &set_label("ctr32_ret"); + &pxor ("xmm0","xmm0"); # clear register bank + &pxor ("xmm1","xmm1"); + &pxor ("xmm2","xmm2"); + &pxor ("xmm3","xmm3"); + &pxor ("xmm4","xmm4"); + &movdqa (&QWP(32,"esp"),"xmm0"); # clear stack + &pxor ("xmm5","xmm5"); + &movdqa (&QWP(48,"esp"),"xmm0"); + &pxor ("xmm6","xmm6"); + &movdqa (&QWP(64,"esp"),"xmm0"); + &pxor ("xmm7","xmm7"); &mov ("esp",&DWP(80,"esp")); &function_end("aesni_ctr32_encrypt_blocks"); @@ -1394,6 +1438,20 @@ if ($PREFIX eq "aesni") { &movups (&QWP(-16,$out),$inout0); # write output &set_label("xts_enc_ret"); + &pxor ("xmm0","xmm0"); # clear register bank + &pxor ("xmm1","xmm1"); + &pxor ("xmm2","xmm2"); + &movdqa (&QWP(16*0,"esp"),"xmm0"); # clear stack + &pxor ("xmm3","xmm3"); + &movdqa (&QWP(16*1,"esp"),"xmm0"); + &pxor ("xmm4","xmm4"); + &movdqa (&QWP(16*2,"esp"),"xmm0"); + &pxor ("xmm5","xmm5"); + &movdqa (&QWP(16*3,"esp"),"xmm0"); + &pxor ("xmm6","xmm6"); + &movdqa (&QWP(16*4,"esp"),"xmm0"); + &pxor ("xmm7","xmm7"); + &movdqa (&QWP(16*5,"esp"),"xmm0"); &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp &function_end("aesni_xts_encrypt"); @@ -1756,6 +1814,20 @@ if ($PREFIX eq "aesni") { &movups (&QWP(0,$out),$inout0); # write output &set_label("xts_dec_ret"); + &pxor ("xmm0","xmm0"); # clear register bank + &pxor ("xmm1","xmm1"); + &pxor ("xmm2","xmm2"); + &movdqa (&QWP(16*0,"esp"),"xmm0"); # clear stack + &pxor ("xmm3","xmm3"); + &movdqa (&QWP(16*1,"esp"),"xmm0"); + &pxor ("xmm4","xmm4"); + &movdqa (&QWP(16*2,"esp"),"xmm0"); + &pxor ("xmm5","xmm5"); + &movdqa (&QWP(16*3,"esp"),"xmm0"); + &pxor ("xmm6","xmm6"); + &movdqa (&QWP(16*4,"esp"),"xmm0"); + &pxor ("xmm7","xmm7"); + &movdqa (&QWP(16*5,"esp"),"xmm0"); &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp &function_end("aesni_xts_decrypt"); } @@ -1808,6 +1880,7 @@ if ($PREFIX eq "aesni") { &add ($len,16); &jnz (&label("cbc_enc_tail")); &movaps ($ivec,$inout0); + &pxor ($inout0,$inout0); &jmp (&label("cbc_ret")); &set_label("cbc_enc_tail"); @@ -1871,7 +1944,7 @@ if ($PREFIX eq "aesni") { &movaps ($inout0,$inout5); &movaps ($ivec,$rndkey0); &add ($len,0x50); - &jle (&label("cbc_dec_tail_collected")); + &jle (&label("cbc_dec_clear_tail_collected")); &movups (&QWP(0,$out),$inout0); &lea ($out,&DWP(0x10,$out)); &set_label("cbc_dec_tail"); @@ -1910,10 +1983,14 @@ if ($PREFIX eq "aesni") { &xorps ($inout4,$rndkey0); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); + &pxor ($inout1,$inout1); &movups (&QWP(0x20,$out),$inout2); + &pxor ($inout2,$inout2); &movups (&QWP(0x30,$out),$inout3); + &pxor ($inout3,$inout3); &lea ($out,&DWP(0x40,$out)); &movaps ($inout0,$inout4); + &pxor ($inout4,$inout4); &sub ($len,0x50); &jmp (&label("cbc_dec_tail_collected")); @@ -1933,6 +2010,7 @@ if ($PREFIX eq "aesni") { &xorps ($inout1,$in0); &movups (&QWP(0,$out),$inout0); &movaps ($inout0,$inout1); + &pxor ($inout1,$inout1); &lea ($out,&DWP(0x10,$out)); &movaps ($ivec,$in1); &sub ($len,0x20); @@ -1945,7 +2023,9 @@ if ($PREFIX eq "aesni") { &xorps ($inout2,$in1); &movups (&QWP(0,$out),$inout0); &movaps ($inout0,$inout2); + &pxor ($inout2,$inout2); &movups (&QWP(0x10,$out),$inout1); + &pxor ($inout1,$inout1); &lea ($out,&DWP(0x20,$out)); &movups ($ivec,&QWP(0x20,$inp)); &sub ($len,0x30); @@ -1961,29 +2041,44 @@ if ($PREFIX eq "aesni") { &movups (&QWP(0,$out),$inout0); &xorps ($inout2,$rndkey1); &movups (&QWP(0x10,$out),$inout1); + &pxor ($inout1,$inout1); &xorps ($inout3,$rndkey0); &movups (&QWP(0x20,$out),$inout2); + &pxor ($inout2,$inout2); &lea ($out,&DWP(0x30,$out)); &movaps ($inout0,$inout3); + &pxor ($inout3,$inout3); &sub ($len,0x40); + &jmp (&label("cbc_dec_tail_collected")); +&set_label("cbc_dec_clear_tail_collected",16); + &pxor ($inout1,$inout1); + &pxor ($inout2,$inout2); + &pxor ($inout3,$inout3); + &pxor ($inout4,$inout4); &set_label("cbc_dec_tail_collected"); &and ($len,15); &jnz (&label("cbc_dec_tail_partial")); &movups (&QWP(0,$out),$inout0); + &pxor ($rndkey0,$rndkey0); &jmp (&label("cbc_ret")); &set_label("cbc_dec_tail_partial",16); &movaps (&QWP(0,"esp"),$inout0); + &pxor ($rndkey0,$rndkey0); &mov ("ecx",16); &mov ($inp,"esp"); &sub ("ecx",$len); &data_word(0xA4F3F689); # rep movsb + &movdqa (&QWP(0,"esp"),$inout0); &set_label("cbc_ret"); &mov ("esp",&DWP(16,"esp")); # pull original %esp &mov ($key_,&wparam(4)); + &pxor ($inout0,$inout0); + &pxor ($rndkey1,$rndkey1); &movups (&QWP(0,$key_),$ivec); # output IV + &pxor ($ivec,$ivec); &set_label("cbc_abort"); &function_end("${PREFIX}_cbc_encrypt"); @@ -2000,14 +2095,24 @@ if ($PREFIX eq "aesni") { # $round rounds &function_begin_B("_aesni_set_encrypt_key"); + &push ("ebp"); + &push ("ebx"); &test ("eax","eax"); &jz (&label("bad_pointer")); &test ($key,$key); &jz (&label("bad_pointer")); + &call (&label("pic")); +&set_label("pic"); + &blindpop("ebx"); + &lea ("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx")); + + &picmeup("ebp","OPENSSL_ia32cap_P","ebx",&label("key_const")); &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey &xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0 + &mov ("ebp",&DWP(4,"ebp")); &lea ($key,&DWP(16,$key)); + &and ("ebp",1<<28|1<<11); # AVX and XOP bits &cmp ($rounds,256); &je (&label("14rounds")); &cmp ($rounds,192); @@ -2016,6 +2121,9 @@ if ($PREFIX eq "aesni") { &jne (&label("bad_keybits")); &set_label("10rounds",16); + &cmp ("ebp",1<<28); + &je (&label("10rounds_alt")); + &mov ($rounds,9); &$movekey (&QWP(-16,$key),"xmm0"); # round 0 &aeskeygenassist("xmm1","xmm0",0x01); # round 1 @@ -2040,8 +2148,8 @@ if ($PREFIX eq "aesni") { &call (&label("key_128")); &$movekey (&QWP(0,$key),"xmm0"); &mov (&DWP(80,$key),$rounds); - &xor ("eax","eax"); - &ret(); + + &jmp (&label("good_key")); &set_label("key_128",16); &$movekey (&QWP(0,$key),"xmm0"); @@ -2055,8 +2163,76 @@ if ($PREFIX eq "aesni") { &xorps ("xmm0","xmm1"); &ret(); +&set_label("10rounds_alt",16); + &movdqa ("xmm5",&QWP(0x00,"ebx")); + &mov ($rounds,8); + &movdqa ("xmm4",&QWP(0x20,"ebx")); + &movdqa ("xmm2","xmm0"); + &movdqu (&QWP(-16,$key),"xmm0"); + +&set_label("loop_key128"); + &pshufb ("xmm0","xmm5"); + &aesenclast ("xmm0","xmm4"); + &pslld ("xmm4",1); + &lea ($key,&DWP(16,$key)); + + &movdqa ("xmm3","xmm2"); + &pslldq ("xmm2",4); + &pxor ("xmm3","xmm2"); + &pslldq ("xmm2",4); + &pxor ("xmm3","xmm2"); + &pslldq ("xmm2",4); + &pxor ("xmm2","xmm3"); + + &pxor ("xmm0","xmm2"); + &movdqu (&QWP(-16,$key),"xmm0"); + &movdqa ("xmm2","xmm0"); + + &dec ($rounds); + &jnz (&label("loop_key128")); + + &movdqa ("xmm4",&QWP(0x30,"ebx")); + + &pshufb ("xmm0","xmm5"); + &aesenclast ("xmm0","xmm4"); + &pslld ("xmm4",1); + + &movdqa ("xmm3","xmm2"); + &pslldq ("xmm2",4); + &pxor ("xmm3","xmm2"); + &pslldq ("xmm2",4); + &pxor ("xmm3","xmm2"); + &pslldq ("xmm2",4); + &pxor ("xmm2","xmm3"); + + &pxor ("xmm0","xmm2"); + &movdqu (&QWP(0,$key),"xmm0"); + + &movdqa ("xmm2","xmm0"); + &pshufb ("xmm0","xmm5"); + &aesenclast ("xmm0","xmm4"); + + &movdqa ("xmm3","xmm2"); + &pslldq ("xmm2",4); + &pxor ("xmm3","xmm2"); + &pslldq ("xmm2",4); + &pxor ("xmm3","xmm2"); + &pslldq ("xmm2",4); + &pxor ("xmm2","xmm3"); + + &pxor ("xmm0","xmm2"); + &movdqu (&QWP(16,$key),"xmm0"); + + &mov ($rounds,9); + &mov (&DWP(96,$key),$rounds); + + &jmp (&label("good_key")); + &set_label("12rounds",16); &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey + &cmp ("ebp",1<<28); + &je (&label("12rounds_alt")); + &mov ($rounds,11); &$movekey (&QWP(-16,$key),"xmm0"); # round 0 &aeskeygenassist("xmm1","xmm2",0x01); # round 1,2 @@ -2077,8 +2253,8 @@ if ($PREFIX eq "aesni") { &call (&label("key_192b")); &$movekey (&QWP(0,$key),"xmm0"); &mov (&DWP(48,$key),$rounds); - &xor ("eax","eax"); - &ret(); + + &jmp (&label("good_key")); &set_label("key_192a",16); &$movekey (&QWP(0,$key),"xmm0"); @@ -2108,10 +2284,52 @@ if ($PREFIX eq "aesni") { &lea ($key,&DWP(32,$key)); &jmp (&label("key_192b_warm")); +&set_label("12rounds_alt",16); + &movdqa ("xmm5",&QWP(0x10,"ebx")); + &movdqa ("xmm4",&QWP(0x20,"ebx")); + &mov ($rounds,8); + &movdqu (&QWP(-16,$key),"xmm0"); + +&set_label("loop_key192"); + &movq (&QWP(0,$key),"xmm2"); + &movdqa ("xmm1","xmm2"); + &pshufb ("xmm2","xmm5"); + &aesenclast ("xmm2","xmm4"); + &pslld ("xmm4",1); + &lea ($key,&DWP(24,$key)); + + &movdqa ("xmm3","xmm0"); + &pslldq ("xmm0",4); + &pxor ("xmm3","xmm0"); + &pslldq ("xmm0",4); + &pxor ("xmm3","xmm0"); + &pslldq ("xmm0",4); + &pxor ("xmm0","xmm3"); + + &pshufd ("xmm3","xmm0",0xff); + &pxor ("xmm3","xmm1"); + &pslldq ("xmm1",4); + &pxor ("xmm3","xmm1"); + + &pxor ("xmm0","xmm2"); + &pxor ("xmm2","xmm3"); + &movdqu (&QWP(-16,$key),"xmm0"); + + &dec ($rounds); + &jnz (&label("loop_key192")); + + &mov ($rounds,11); + &mov (&DWP(32,$key),$rounds); + + &jmp (&label("good_key")); + &set_label("14rounds",16); &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey - &mov ($rounds,13); &lea ($key,&DWP(16,$key)); + &cmp ("ebp",1<<28); + &je (&label("14rounds_alt")); + + &mov ($rounds,13); &$movekey (&QWP(-32,$key),"xmm0"); # round 0 &$movekey (&QWP(-16,$key),"xmm2"); # round 1 &aeskeygenassist("xmm1","xmm2",0x01); # round 2 @@ -2143,7 +2361,8 @@ if ($PREFIX eq "aesni") { &$movekey (&QWP(0,$key),"xmm0"); &mov (&DWP(16,$key),$rounds); &xor ("eax","eax"); - &ret(); + + &jmp (&label("good_key")); &set_label("key_256a",16); &$movekey (&QWP(0,$key),"xmm2"); @@ -2169,11 +2388,77 @@ if ($PREFIX eq "aesni") { &xorps ("xmm2","xmm1"); &ret(); +&set_label("14rounds_alt",16); + &movdqa ("xmm5",&QWP(0x00,"ebx")); + &movdqa ("xmm4",&QWP(0x20,"ebx")); + &mov ($rounds,7); + &movdqu (&QWP(-32,$key),"xmm0"); + &movdqa ("xmm1","xmm2"); + &movdqu (&QWP(-16,$key),"xmm2"); + +&set_label("loop_key256"); + &pshufb ("xmm2","xmm5"); + &aesenclast ("xmm2","xmm4"); + + &movdqa ("xmm3","xmm0"); + &pslldq ("xmm0",4); + &pxor ("xmm3","xmm0"); + &pslldq ("xmm0",4); + &pxor ("xmm3","xmm0"); + &pslldq ("xmm0",4); + &pxor ("xmm0","xmm3"); + &pslld ("xmm4",1); + + &pxor ("xmm0","xmm2"); + &movdqu (&QWP(0,$key),"xmm0"); + + &dec ($rounds); + &jz (&label("done_key256")); + + &pshufd ("xmm2","xmm0",0xff); + &pxor ("xmm3","xmm3"); + &aesenclast ("xmm2","xmm3"); + + &movdqa ("xmm3","xmm1") + &pslldq ("xmm1",4); + &pxor ("xmm3","xmm1"); + &pslldq ("xmm1",4); + &pxor ("xmm3","xmm1"); + &pslldq ("xmm1",4); + &pxor ("xmm1","xmm3"); + + &pxor ("xmm2","xmm1"); + &movdqu (&QWP(16,$key),"xmm2"); + &lea ($key,&DWP(32,$key)); + &movdqa ("xmm1","xmm2"); + &jmp (&label("loop_key256")); + +&set_label("done_key256"); + &mov ($rounds,13); + &mov (&DWP(16,$key),$rounds); + +&set_label("good_key"); + &pxor ("xmm0","xmm0"); + &pxor ("xmm1","xmm1"); + &pxor ("xmm2","xmm2"); + &pxor ("xmm3","xmm3"); + &pxor ("xmm4","xmm4"); + &pxor ("xmm5","xmm5"); + &xor ("eax","eax"); + &pop ("ebx"); + &pop ("ebp"); + &ret (); + &set_label("bad_pointer",4); &mov ("eax",-1); + &pop ("ebx"); + &pop ("ebp"); &ret (); &set_label("bad_keybits",4); + &pxor ("xmm0","xmm0"); &mov ("eax",-2); + &pop ("ebx"); + &pop ("ebp"); &ret (); &function_end_B("_aesni_set_encrypt_key"); @@ -2223,10 +2508,18 @@ if ($PREFIX eq "aesni") { &aesimc ("xmm0","xmm0"); &$movekey (&QWP(0,$key),"xmm0"); + &pxor ("xmm0","xmm0"); + &pxor ("xmm1","xmm1"); &xor ("eax","eax"); # return success &set_label("dec_key_ret"); &ret (); &function_end_B("${PREFIX}_set_decrypt_key"); + +&set_label("key_const",64); +&data_word(0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d); +&data_word(0x04070605,0x04070605,0x04070605,0x04070605); +&data_word(1,1,1,1); +&data_word(0x1b,0x1b,0x1b,0x1b); &asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"); &asm_finish(); diff --git a/src/crypto/aes/asm/aesni-x86_64.pl b/src/crypto/aes/asm/aesni-x86_64.pl index 5f61746..25ca574 100644 --- a/src/crypto/aes/asm/aesni-x86_64.pl +++ b/src/crypto/aes/asm/aesni-x86_64.pl @@ -165,11 +165,11 @@ # Westmere 3.77/1.25 1.25 1.25 1.26 # * Bridge 5.07/0.74 0.75 0.90 0.85 # Haswell 4.44/0.63 0.63 0.73 0.63 -# Atom 5.75/3.54 3.56 4.12 3.87(*) +# Silvermont 5.75/3.54 3.56 4.12 3.87(*) # Bulldozer 5.77/0.70 0.72 0.90 0.70 # -# (*) Atom ECB result is suboptimal because of penalties incurred -# by operations on %xmm8-15. As ECB is not considered +# (*) Atom Silvermont ECB result is suboptimal because of penalties +# incurred by operations on %xmm8-15. As ECB is not considered # critical, nothing was done to mitigate the problem. $PREFIX="aesni"; # if $PREFIX is set to "AES", the script @@ -263,7 +263,10 @@ ${PREFIX}_encrypt: ___ &aesni_generate1("enc",$key,$rounds); $code.=<<___; + pxor $rndkey0,$rndkey0 # clear register bank + pxor $rndkey1,$rndkey1 movups $inout0,($out) # output + pxor $inout0,$inout0 ret .size ${PREFIX}_encrypt,.-${PREFIX}_encrypt @@ -276,7 +279,10 @@ ${PREFIX}_decrypt: ___ &aesni_generate1("dec",$key,$rounds); $code.=<<___; + pxor $rndkey0,$rndkey0 # clear register bank + pxor $rndkey1,$rndkey1 movups $inout0,($out) # output + pxor $inout0,$inout0 ret .size ${PREFIX}_decrypt, .-${PREFIX}_decrypt ___ @@ -445,21 +451,18 @@ _aesni_${dir}rypt6: pxor $rndkey0,$inout4 aes${dir} $rndkey1,$inout2 pxor $rndkey0,$inout5 + $movkey ($key,%rax),$rndkey0 add \$16,%rax - aes${dir} $rndkey1,$inout3 - aes${dir} $rndkey1,$inout4 - aes${dir} $rndkey1,$inout5 - $movkey -16($key,%rax),$rndkey0 jmp .L${dir}_loop6_enter .align 16 .L${dir}_loop6: aes${dir} $rndkey1,$inout0 aes${dir} $rndkey1,$inout1 aes${dir} $rndkey1,$inout2 +.L${dir}_loop6_enter: aes${dir} $rndkey1,$inout3 aes${dir} $rndkey1,$inout4 aes${dir} $rndkey1,$inout5 -.L${dir}_loop6_enter: $movkey ($key,%rax),$rndkey1 add \$32,%rax aes${dir} $rndkey0,$inout0 @@ -506,23 +509,18 @@ _aesni_${dir}rypt8: lea 32($key,$rounds),$key neg %rax # $rounds aes${dir} $rndkey1,$inout0 - add \$16,%rax pxor $rndkey0,$inout5 - aes${dir} $rndkey1,$inout1 pxor $rndkey0,$inout6 + aes${dir} $rndkey1,$inout1 pxor $rndkey0,$inout7 - aes${dir} $rndkey1,$inout2 - aes${dir} $rndkey1,$inout3 - aes${dir} $rndkey1,$inout4 - aes${dir} $rndkey1,$inout5 - aes${dir} $rndkey1,$inout6 - aes${dir} $rndkey1,$inout7 - $movkey -16($key,%rax),$rndkey0 - jmp .L${dir}_loop8_enter + $movkey ($key,%rax),$rndkey0 + add \$16,%rax + jmp .L${dir}_loop8_inner .align 16 .L${dir}_loop8: aes${dir} $rndkey1,$inout0 aes${dir} $rndkey1,$inout1 +.L${dir}_loop8_inner: aes${dir} $rndkey1,$inout2 aes${dir} $rndkey1,$inout3 aes${dir} $rndkey1,$inout4 @@ -587,15 +585,15 @@ aesni_ecb_encrypt: ___ $code.=<<___ if ($win64); lea -0x58(%rsp),%rsp - movaps %xmm6,(%rsp) + movaps %xmm6,(%rsp) # offload $inout4..7 movaps %xmm7,0x10(%rsp) movaps %xmm8,0x20(%rsp) movaps %xmm9,0x30(%rsp) .Lecb_enc_body: ___ $code.=<<___; - and \$-16,$len - jz .Lecb_ret + and \$-16,$len # if ($len<16) + jz .Lecb_ret # return mov 240($key),$rounds # key->rounds $movkey ($key),$rndkey0 @@ -604,10 +602,10 @@ $code.=<<___; test %r8d,%r8d # 5th argument jz .Lecb_decrypt #--------------------------- ECB ENCRYPT ------------------------------# - cmp \$0x80,$len - jb .Lecb_enc_tail + cmp \$0x80,$len # if ($len<8*16) + jb .Lecb_enc_tail # short input - movdqu ($inp),$inout0 + movdqu ($inp),$inout0 # load 8 input blocks movdqu 0x10($inp),$inout1 movdqu 0x20($inp),$inout2 movdqu 0x30($inp),$inout3 @@ -615,14 +613,14 @@ $code.=<<___; movdqu 0x50($inp),$inout5 movdqu 0x60($inp),$inout6 movdqu 0x70($inp),$inout7 - lea 0x80($inp),$inp - sub \$0x80,$len + lea 0x80($inp),$inp # $inp+=8*16 + sub \$0x80,$len # $len-=8*16 (can be zero) jmp .Lecb_enc_loop8_enter .align 16 .Lecb_enc_loop8: - movups $inout0,($out) + movups $inout0,($out) # store 8 output blocks mov $key_,$key # restore $key - movdqu ($inp),$inout0 + movdqu ($inp),$inout0 # load 8 input blocks mov $rnds_,$rounds # restore $rounds movups $inout1,0x10($out) movdqu 0x10($inp),$inout1 @@ -637,17 +635,17 @@ $code.=<<___; movups $inout6,0x60($out) movdqu 0x60($inp),$inout6 movups $inout7,0x70($out) - lea 0x80($out),$out + lea 0x80($out),$out # $out+=8*16 movdqu 0x70($inp),$inout7 - lea 0x80($inp),$inp + lea 0x80($inp),$inp # $inp+=8*16 .Lecb_enc_loop8_enter: call _aesni_encrypt8 sub \$0x80,$len - jnc .Lecb_enc_loop8 + jnc .Lecb_enc_loop8 # loop if $len-=8*16 didn't borrow - movups $inout0,($out) + movups $inout0,($out) # store 8 output blocks mov $key_,$key # restore $key movups $inout1,0x10($out) mov $rnds_,$rounds # restore $rounds @@ -657,11 +655,11 @@ $code.=<<___; movups $inout5,0x50($out) movups $inout6,0x60($out) movups $inout7,0x70($out) - lea 0x80($out),$out - add \$0x80,$len - jz .Lecb_ret + lea 0x80($out),$out # $out+=8*16 + add \$0x80,$len # restore real remaining $len + jz .Lecb_ret # done if ($len==0) -.Lecb_enc_tail: +.Lecb_enc_tail: # $len is less than 8*16 movups ($inp),$inout0 cmp \$0x20,$len jb .Lecb_enc_one @@ -678,8 +676,9 @@ $code.=<<___; movups 0x50($inp),$inout5 je .Lecb_enc_six movdqu 0x60($inp),$inout6 + xorps $inout7,$inout7 call _aesni_encrypt8 - movups $inout0,($out) + movups $inout0,($out) # store 7 output blocks movups $inout1,0x10($out) movups $inout2,0x20($out) movups $inout3,0x30($out) @@ -692,25 +691,25 @@ $code.=<<___; ___ &aesni_generate1("enc",$key,$rounds); $code.=<<___; - movups $inout0,($out) + movups $inout0,($out) # store one output block jmp .Lecb_ret .align 16 .Lecb_enc_two: call _aesni_encrypt2 - movups $inout0,($out) + movups $inout0,($out) # store 2 output blocks movups $inout1,0x10($out) jmp .Lecb_ret .align 16 .Lecb_enc_three: call _aesni_encrypt3 - movups $inout0,($out) + movups $inout0,($out) # store 3 output blocks movups $inout1,0x10($out) movups $inout2,0x20($out) jmp .Lecb_ret .align 16 .Lecb_enc_four: call _aesni_encrypt4 - movups $inout0,($out) + movups $inout0,($out) # store 4 output blocks movups $inout1,0x10($out) movups $inout2,0x20($out) movups $inout3,0x30($out) @@ -719,7 +718,7 @@ $code.=<<___; .Lecb_enc_five: xorps $inout5,$inout5 call _aesni_encrypt6 - movups $inout0,($out) + movups $inout0,($out) # store 5 output blocks movups $inout1,0x10($out) movups $inout2,0x20($out) movups $inout3,0x30($out) @@ -728,7 +727,7 @@ $code.=<<___; .align 16 .Lecb_enc_six: call _aesni_encrypt6 - movups $inout0,($out) + movups $inout0,($out) # store 6 output blocks movups $inout1,0x10($out) movups $inout2,0x20($out) movups $inout3,0x30($out) @@ -738,10 +737,10 @@ $code.=<<___; #--------------------------- ECB DECRYPT ------------------------------# .align 16 .Lecb_decrypt: - cmp \$0x80,$len - jb .Lecb_dec_tail + cmp \$0x80,$len # if ($len<8*16) + jb .Lecb_dec_tail # short input - movdqu ($inp),$inout0 + movdqu ($inp),$inout0 # load 8 input blocks movdqu 0x10($inp),$inout1 movdqu 0x20($inp),$inout2 movdqu 0x30($inp),$inout3 @@ -749,14 +748,14 @@ $code.=<<___; movdqu 0x50($inp),$inout5 movdqu 0x60($inp),$inout6 movdqu 0x70($inp),$inout7 - lea 0x80($inp),$inp - sub \$0x80,$len + lea 0x80($inp),$inp # $inp+=8*16 + sub \$0x80,$len # $len-=8*16 (can be zero) jmp .Lecb_dec_loop8_enter .align 16 .Lecb_dec_loop8: - movups $inout0,($out) + movups $inout0,($out) # store 8 output blocks mov $key_,$key # restore $key - movdqu ($inp),$inout0 + movdqu ($inp),$inout0 # load 8 input blocks mov $rnds_,$rounds # restore $rounds movups $inout1,0x10($out) movdqu 0x10($inp),$inout1 @@ -771,30 +770,38 @@ $code.=<<___; movups $inout6,0x60($out) movdqu 0x60($inp),$inout6 movups $inout7,0x70($out) - lea 0x80($out),$out + lea 0x80($out),$out # $out+=8*16 movdqu 0x70($inp),$inout7 - lea 0x80($inp),$inp + lea 0x80($inp),$inp # $inp+=8*16 .Lecb_dec_loop8_enter: call _aesni_decrypt8 $movkey ($key_),$rndkey0 sub \$0x80,$len - jnc .Lecb_dec_loop8 + jnc .Lecb_dec_loop8 # loop if $len-=8*16 didn't borrow - movups $inout0,($out) + movups $inout0,($out) # store 8 output blocks + pxor $inout0,$inout0 # clear register bank mov $key_,$key # restore $key movups $inout1,0x10($out) + pxor $inout1,$inout1 mov $rnds_,$rounds # restore $rounds movups $inout2,0x20($out) + pxor $inout2,$inout2 movups $inout3,0x30($out) + pxor $inout3,$inout3 movups $inout4,0x40($out) + pxor $inout4,$inout4 movups $inout5,0x50($out) + pxor $inout5,$inout5 movups $inout6,0x60($out) + pxor $inout6,$inout6 movups $inout7,0x70($out) - lea 0x80($out),$out - add \$0x80,$len - jz .Lecb_ret + pxor $inout7,$inout7 + lea 0x80($out),$out # $out+=8*16 + add \$0x80,$len # restore real remaining $len + jz .Lecb_ret # done if ($len==0) .Lecb_dec_tail: movups ($inp),$inout0 @@ -814,70 +821,107 @@ $code.=<<___; je .Lecb_dec_six movups 0x60($inp),$inout6 $movkey ($key),$rndkey0 + xorps $inout7,$inout7 call _aesni_decrypt8 - movups $inout0,($out) + movups $inout0,($out) # store 7 output blocks + pxor $inout0,$inout0 # clear register bank movups $inout1,0x10($out) + pxor $inout1,$inout1 movups $inout2,0x20($out) + pxor $inout2,$inout2 movups $inout3,0x30($out) + pxor $inout3,$inout3 movups $inout4,0x40($out) + pxor $inout4,$inout4 movups $inout5,0x50($out) + pxor $inout5,$inout5 movups $inout6,0x60($out) + pxor $inout6,$inout6 + pxor $inout7,$inout7 jmp .Lecb_ret .align 16 .Lecb_dec_one: ___ &aesni_generate1("dec",$key,$rounds); $code.=<<___; - movups $inout0,($out) + movups $inout0,($out) # store one output block + pxor $inout0,$inout0 # clear register bank jmp .Lecb_ret .align 16 .Lecb_dec_two: call _aesni_decrypt2 - movups $inout0,($out) + movups $inout0,($out) # store 2 output blocks + pxor $inout0,$inout0 # clear register bank movups $inout1,0x10($out) + pxor $inout1,$inout1 jmp .Lecb_ret .align 16 .Lecb_dec_three: call _aesni_decrypt3 - movups $inout0,($out) + movups $inout0,($out) # store 3 output blocks + pxor $inout0,$inout0 # clear register bank movups $inout1,0x10($out) + pxor $inout1,$inout1 movups $inout2,0x20($out) + pxor $inout2,$inout2 jmp .Lecb_ret .align 16 .Lecb_dec_four: call _aesni_decrypt4 - movups $inout0,($out) + movups $inout0,($out) # store 4 output blocks + pxor $inout0,$inout0 # clear register bank movups $inout1,0x10($out) + pxor $inout1,$inout1 movups $inout2,0x20($out) + pxor $inout2,$inout2 movups $inout3,0x30($out) + pxor $inout3,$inout3 jmp .Lecb_ret .align 16 .Lecb_dec_five: xorps $inout5,$inout5 call _aesni_decrypt6 - movups $inout0,($out) + movups $inout0,($out) # store 5 output blocks + pxor $inout0,$inout0 # clear register bank movups $inout1,0x10($out) + pxor $inout1,$inout1 movups $inout2,0x20($out) + pxor $inout2,$inout2 movups $inout3,0x30($out) + pxor $inout3,$inout3 movups $inout4,0x40($out) + pxor $inout4,$inout4 + pxor $inout5,$inout5 jmp .Lecb_ret .align 16 .Lecb_dec_six: call _aesni_decrypt6 - movups $inout0,($out) + movups $inout0,($out) # store 6 output blocks + pxor $inout0,$inout0 # clear register bank movups $inout1,0x10($out) + pxor $inout1,$inout1 movups $inout2,0x20($out) + pxor $inout2,$inout2 movups $inout3,0x30($out) + pxor $inout3,$inout3 movups $inout4,0x40($out) + pxor $inout4,$inout4 movups $inout5,0x50($out) + pxor $inout5,$inout5 .Lecb_ret: + xorps $rndkey0,$rndkey0 # %xmm0 + pxor $rndkey1,$rndkey1 ___ $code.=<<___ if ($win64); movaps (%rsp),%xmm6 + movaps %xmm0,(%rsp) # clear stack movaps 0x10(%rsp),%xmm7 + movaps %xmm0,0x10(%rsp) movaps 0x20(%rsp),%xmm8 + movaps %xmm0,0x20(%rsp) movaps 0x30(%rsp),%xmm9 + movaps %xmm0,0x30(%rsp) lea 0x58(%rsp),%rsp .Lecb_enc_ret: ___ @@ -911,10 +955,10 @@ aesni_ccm64_encrypt_blocks: ___ $code.=<<___ if ($win64); lea -0x58(%rsp),%rsp - movaps %xmm6,(%rsp) - movaps %xmm7,0x10(%rsp) - movaps %xmm8,0x20(%rsp) - movaps %xmm9,0x30(%rsp) + movaps %xmm6,(%rsp) # $iv + movaps %xmm7,0x10(%rsp) # $bswap_mask + movaps %xmm8,0x20(%rsp) # $in0 + movaps %xmm9,0x30(%rsp) # $increment .Lccm64_enc_body: ___ $code.=<<___; @@ -956,7 +1000,7 @@ $code.=<<___; aesenc $rndkey1,$inout0 aesenc $rndkey1,$inout1 paddq $increment,$iv - dec $len + dec $len # $len-- ($len is in blocks) aesenclast $rndkey0,$inout0 aesenclast $rndkey0,$inout1 @@ -965,16 +1009,26 @@ $code.=<<___; movdqa $iv,$inout0 movups $in0,($out) # save output pshufb $bswap_mask,$inout0 - lea 16($out),$out - jnz .Lccm64_enc_outer + lea 16($out),$out # $out+=16 + jnz .Lccm64_enc_outer # loop if ($len!=0) - movups $inout1,($cmac) + pxor $rndkey0,$rndkey0 # clear register bank + pxor $rndkey1,$rndkey1 + pxor $inout0,$inout0 + movups $inout1,($cmac) # store resulting mac + pxor $inout1,$inout1 + pxor $in0,$in0 + pxor $iv,$iv ___ $code.=<<___ if ($win64); movaps (%rsp),%xmm6 + movaps %xmm0,(%rsp) # clear stack movaps 0x10(%rsp),%xmm7 + movaps %xmm0,0x10(%rsp) movaps 0x20(%rsp),%xmm8 + movaps %xmm0,0x20(%rsp) movaps 0x30(%rsp),%xmm9 + movaps %xmm0,0x30(%rsp) lea 0x58(%rsp),%rsp .Lccm64_enc_ret: ___ @@ -991,10 +1045,10 @@ aesni_ccm64_decrypt_blocks: ___ $code.=<<___ if ($win64); lea -0x58(%rsp),%rsp - movaps %xmm6,(%rsp) - movaps %xmm7,0x10(%rsp) - movaps %xmm8,0x20(%rsp) - movaps %xmm9,0x30(%rsp) + movaps %xmm6,(%rsp) # $iv + movaps %xmm7,0x10(%rsp) # $bswap_mask + movaps %xmm8,0x20(%rsp) # $in8 + movaps %xmm9,0x30(%rsp) # $increment .Lccm64_dec_body: ___ $code.=<<___; @@ -1015,7 +1069,7 @@ $code.=<<___; mov \$16,$rounds movups ($inp),$in0 # load inp paddq $increment,$iv - lea 16($inp),$inp + lea 16($inp),$inp # $inp+=16 sub %r10,%rax # twisted $rounds lea 32($key_,$rnds_),$key # end of key schedule mov %rax,%r10 @@ -1025,11 +1079,11 @@ $code.=<<___; xorps $inout0,$in0 # inp ^= E(iv) movdqa $iv,$inout0 movups $in0,($out) # save output - lea 16($out),$out + lea 16($out),$out # $out+=16 pshufb $bswap_mask,$inout0 - sub \$1,$len - jz .Lccm64_dec_break + sub \$1,$len # $len-- ($len is in blocks) + jz .Lccm64_dec_break # if ($len==0) break $movkey ($key_),$rndkey0 mov %r10,%rax @@ -1049,13 +1103,13 @@ $code.=<<___; aesenc $rndkey0,$inout1 $movkey -16($key,%rax),$rndkey0 jnz .Lccm64_dec2_loop - movups ($inp),$in0 # load inp + movups ($inp),$in0 # load input paddq $increment,$iv aesenc $rndkey1,$inout0 aesenc $rndkey1,$inout1 aesenclast $rndkey0,$inout0 aesenclast $rndkey0,$inout1 - lea 16($inp),$inp + lea 16($inp),$inp # $inp+=16 jmp .Lccm64_dec_outer .align 16 @@ -1065,13 +1119,23 @@ $code.=<<___; ___ &aesni_generate1("enc",$key_,$rounds,$inout1,$in0); $code.=<<___; - movups $inout1,($cmac) + pxor $rndkey0,$rndkey0 # clear register bank + pxor $rndkey1,$rndkey1 + pxor $inout0,$inout0 + movups $inout1,($cmac) # store resulting mac + pxor $inout1,$inout1 + pxor $in0,$in0 + pxor $iv,$iv ___ $code.=<<___ if ($win64); movaps (%rsp),%xmm6 + movaps %xmm0,(%rsp) # clear stack movaps 0x10(%rsp),%xmm7 + movaps %xmm0,0x10(%rsp) movaps 0x20(%rsp),%xmm8 + movaps %xmm0,0x20(%rsp) movaps 0x30(%rsp),%xmm9 + movaps %xmm0,0x30(%rsp) lea 0x58(%rsp),%rsp .Lccm64_dec_ret: ___ @@ -1102,13 +1166,34 @@ $code.=<<___; .type aesni_ctr32_encrypt_blocks,\@function,5 .align 16 aesni_ctr32_encrypt_blocks: + cmp \$1,$len + jne .Lctr32_bulk + + # handle single block without allocating stack frame, + # useful when handling edges + movups ($ivp),$inout0 + movups ($inp),$inout1 + mov 240($key),%edx # key->rounds +___ + &aesni_generate1("enc",$key,"%edx"); +$code.=<<___; + pxor $rndkey0,$rndkey0 # clear register bank + pxor $rndkey1,$rndkey1 + xorps $inout1,$inout0 + pxor $inout1,$inout1 + movups $inout0,($out) + xorps $inout0,$inout0 + jmp .Lctr32_epilogue + +.align 16 +.Lctr32_bulk: lea (%rsp),%rax push %rbp sub \$$frame_size,%rsp and \$-16,%rsp # Linux kernel stack can be incorrectly seeded ___ $code.=<<___ if ($win64); - movaps %xmm6,-0xa8(%rax) + movaps %xmm6,-0xa8(%rax) # offload everything movaps %xmm7,-0x98(%rax) movaps %xmm8,-0x88(%rax) movaps %xmm9,-0x78(%rax) @@ -1123,8 +1208,8 @@ ___ $code.=<<___; lea -8(%rax),%rbp - cmp \$1,$len - je .Lctr32_one_shortcut + # 8 16-byte words on top of stack are counter values + # xor-ed with zero-round key movdqu ($ivp),$inout0 movdqu ($key),$rndkey0 @@ -1139,7 +1224,7 @@ $code.=<<___; movdqa $inout0,0x40(%rsp) movdqa $inout0,0x50(%rsp) movdqa $inout0,0x60(%rsp) - mov %rdx,%r10 # borrow %rdx + mov %rdx,%r10 # about to borrow %rdx movdqa $inout0,0x70(%rsp) lea 1($ctr),%rax @@ -1183,15 +1268,15 @@ $code.=<<___; movdqa 0x40(%rsp),$inout4 movdqa 0x50(%rsp),$inout5 - cmp \$8,$len - jb .Lctr32_tail + cmp \$8,$len # $len is in blocks + jb .Lctr32_tail # short input if ($len<8) - sub \$6,$len + sub \$6,$len # $len is biased by -6 cmp \$`1<<22`,%r10d # check for MOVBE without XSAVE - je .Lctr32_6x + je .Lctr32_6x # [which denotes Atom Silvermont] lea 0x80($key),$key # size optimization - sub \$2,$len + sub \$2,$len # $len is biased by -8 jmp .Lctr32_loop8 .align 16 @@ -1205,13 +1290,13 @@ $code.=<<___; .align 16 .Lctr32_loop6: - add \$6,$ctr + add \$6,$ctr # next counter value $movkey -48($key,$rnds_),$rndkey0 aesenc $rndkey1,$inout0 mov $ctr,%eax xor $key0,%eax aesenc $rndkey1,$inout1 - movbe %eax,`0x00+12`(%rsp) + movbe %eax,`0x00+12`(%rsp) # store next counter value lea 1($ctr),%eax aesenc $rndkey1,$inout2 xor $key0,%eax @@ -1244,16 +1329,16 @@ $code.=<<___; call .Lenc_loop6 - movdqu ($inp),$inout6 + movdqu ($inp),$inout6 # load 6 input blocks movdqu 0x10($inp),$inout7 movdqu 0x20($inp),$in0 movdqu 0x30($inp),$in1 movdqu 0x40($inp),$in2 movdqu 0x50($inp),$in3 - lea 0x60($inp),$inp + lea 0x60($inp),$inp # $inp+=6*16 $movkey -64($key,$rnds_),$rndkey1 - pxor $inout0,$inout6 - movaps 0x00(%rsp),$inout0 + pxor $inout0,$inout6 # inp^=E(ctr) + movaps 0x00(%rsp),$inout0 # load next counter [xor-ed with 0 round] pxor $inout1,$inout7 movaps 0x10(%rsp),$inout1 pxor $inout2,$in0 @@ -1264,19 +1349,19 @@ $code.=<<___; movaps 0x40(%rsp),$inout4 pxor $inout5,$in3 movaps 0x50(%rsp),$inout5 - movdqu $inout6,($out) + movdqu $inout6,($out) # store 6 output blocks movdqu $inout7,0x10($out) movdqu $in0,0x20($out) movdqu $in1,0x30($out) movdqu $in2,0x40($out) movdqu $in3,0x50($out) - lea 0x60($out),$out - + lea 0x60($out),$out # $out+=6*16 + sub \$6,$len - jnc .Lctr32_loop6 + jnc .Lctr32_loop6 # loop if $len-=6 didn't borrow - add \$6,$len - jz .Lctr32_done + add \$6,$len # restore real remaining $len + jz .Lctr32_done # done if ($len==0) lea -48($rnds_),$rounds lea -80($key,$rnds_),$key # restore $key @@ -1286,7 +1371,7 @@ $code.=<<___; .align 32 .Lctr32_loop8: - add \$8,$ctr + add \$8,$ctr # next counter value movdqa 0x60(%rsp),$inout6 aesenc $rndkey1,$inout0 mov $ctr,%r9d @@ -1298,7 +1383,7 @@ $code.=<<___; xor $key0,%r9d nop aesenc $rndkey1,$inout3 - mov %r9d,0x00+12(%rsp) + mov %r9d,0x00+12(%rsp) # store next counter value lea 1($ctr),%r9 aesenc $rndkey1,$inout4 aesenc $rndkey1,$inout5 @@ -1331,7 +1416,7 @@ $code.=<<___; aesenc $rndkey0,$inout1 aesenc $rndkey0,$inout2 xor $key0,%r9d - movdqu 0x00($inp),$in0 + movdqu 0x00($inp),$in0 # start loading input aesenc $rndkey0,$inout3 mov %r9d,0x70+12(%rsp) cmp \$11,$rounds @@ -1388,7 +1473,7 @@ $code.=<<___; .align 16 .Lctr32_enc_done: movdqu 0x10($inp),$in1 - pxor $rndkey0,$in0 + pxor $rndkey0,$in0 # input^=round[last] movdqu 0x20($inp),$in2 pxor $rndkey0,$in1 movdqu 0x30($inp),$in3 @@ -1406,11 +1491,11 @@ $code.=<<___; aesenc $rndkey1,$inout5 aesenc $rndkey1,$inout6 aesenc $rndkey1,$inout7 - movdqu 0x60($inp),$rndkey1 - lea 0x80($inp),$inp + movdqu 0x60($inp),$rndkey1 # borrow $rndkey1 for inp[6] + lea 0x80($inp),$inp # $inp+=8*16 - aesenclast $in0,$inout0 - pxor $rndkey0,$rndkey1 + aesenclast $in0,$inout0 # $inN is inp[N]^round[last] + pxor $rndkey0,$rndkey1 # borrowed $rndkey movdqu 0x70-0x80($inp),$in0 aesenclast $in1,$inout1 pxor $rndkey0,$in0 @@ -1425,10 +1510,10 @@ $code.=<<___; movdqa 0x40(%rsp),$in5 aesenclast $rndkey1,$inout6 movdqa 0x50(%rsp),$rndkey0 - $movkey 0x10-0x80($key),$rndkey1 + $movkey 0x10-0x80($key),$rndkey1#real 1st-round key aesenclast $in0,$inout7 - movups $inout0,($out) # store output + movups $inout0,($out) # store 8 output blocks movdqa $in1,$inout0 movups $inout1,0x10($out) movdqa $in2,$inout1 @@ -1442,21 +1527,24 @@ $code.=<<___; movdqa $rndkey0,$inout5 movups $inout6,0x60($out) movups $inout7,0x70($out) - lea 0x80($out),$out - + lea 0x80($out),$out # $out+=8*16 + sub \$8,$len - jnc .Lctr32_loop8 + jnc .Lctr32_loop8 # loop if $len-=8 didn't borrow - add \$8,$len - jz .Lctr32_done + add \$8,$len # restore real remainig $len + jz .Lctr32_done # done if ($len==0) lea -0x80($key),$key .Lctr32_tail: + # note that at this point $inout0..5 are populated with + # counter values xor-ed with 0-round key lea 16($key),$key cmp \$4,$len jb .Lctr32_loop3 je .Lctr32_loop4 + # if ($len>4) compute 7 E(counter) shl \$4,$rounds movdqa 0x60(%rsp),$inout6 pxor $inout7,$inout7 @@ -1464,14 +1552,14 @@ $code.=<<___; $movkey 16($key),$rndkey0 aesenc $rndkey1,$inout0 aesenc $rndkey1,$inout1 - lea 32-16($key,$rounds),$key + lea 32-16($key,$rounds),$key# prepare for .Lenc_loop8_enter neg %rax aesenc $rndkey1,$inout2 - add \$16,%rax + add \$16,%rax # prepare for .Lenc_loop8_enter movups ($inp),$in0 aesenc $rndkey1,$inout3 aesenc $rndkey1,$inout4 - movups 0x10($inp),$in1 + movups 0x10($inp),$in1 # pre-load input movups 0x20($inp),$in2 aesenc $rndkey1,$inout5 aesenc $rndkey1,$inout6 @@ -1482,7 +1570,7 @@ $code.=<<___; pxor $in0,$inout0 movdqu 0x40($inp),$in0 pxor $in1,$inout1 - movdqu $inout0,($out) + movdqu $inout0,($out) # store output pxor $in2,$inout2 movdqu $inout1,0x10($out) pxor $in3,$inout3 @@ -1491,17 +1579,17 @@ $code.=<<___; movdqu $inout3,0x30($out) movdqu $inout4,0x40($out) cmp \$6,$len - jb .Lctr32_done + jb .Lctr32_done # $len was 5, stop store movups 0x50($inp),$in1 xorps $in1,$inout5 movups $inout5,0x50($out) - je .Lctr32_done + je .Lctr32_done # $len was 6, stop store movups 0x60($inp),$in2 xorps $in2,$inout6 movups $inout6,0x60($out) - jmp .Lctr32_done + jmp .Lctr32_done # $len was 7, stop store .align 32 .Lctr32_loop4: @@ -1515,7 +1603,7 @@ $code.=<<___; jnz .Lctr32_loop4 aesenclast $rndkey1,$inout0 aesenclast $rndkey1,$inout1 - movups ($inp),$in0 + movups ($inp),$in0 # load input movups 0x10($inp),$in1 aesenclast $rndkey1,$inout2 aesenclast $rndkey1,$inout3 @@ -1523,14 +1611,14 @@ $code.=<<___; movups 0x30($inp),$in3 xorps $in0,$inout0 - movups $inout0,($out) + movups $inout0,($out) # store output xorps $in1,$inout1 movups $inout1,0x10($out) pxor $in2,$inout2 movdqu $inout2,0x20($out) pxor $in3,$inout3 movdqu $inout3,0x30($out) - jmp .Lctr32_done + jmp .Lctr32_done # $len was 4, stop store .align 32 .Lctr32_loop3: @@ -1545,48 +1633,79 @@ $code.=<<___; aesenclast $rndkey1,$inout1 aesenclast $rndkey1,$inout2 - movups ($inp),$in0 + movups ($inp),$in0 # load input xorps $in0,$inout0 - movups $inout0,($out) + movups $inout0,($out) # store output cmp \$2,$len - jb .Lctr32_done + jb .Lctr32_done # $len was 1, stop store movups 0x10($inp),$in1 xorps $in1,$inout1 movups $inout1,0x10($out) - je .Lctr32_done + je .Lctr32_done # $len was 2, stop store movups 0x20($inp),$in2 xorps $in2,$inout2 - movups $inout2,0x20($out) - jmp .Lctr32_done - -.align 16 -.Lctr32_one_shortcut: - movups ($ivp),$inout0 - movups ($inp),$in0 - mov 240($key),$rounds # key->rounds -___ - &aesni_generate1("enc",$key,$rounds); -$code.=<<___; - xorps $in0,$inout0 - movups $inout0,($out) - jmp .Lctr32_done + movups $inout2,0x20($out) # $len was 3, stop store -.align 16 .Lctr32_done: + xorps %xmm0,%xmm0 # clear regiser bank + xor $key0,$key0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 +___ +$code.=<<___ if (!$win64); + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + movaps %xmm0,0x00(%rsp) # clear stack + pxor %xmm8,%xmm8 + movaps %xmm0,0x10(%rsp) + pxor %xmm9,%xmm9 + movaps %xmm0,0x20(%rsp) + pxor %xmm10,%xmm10 + movaps %xmm0,0x30(%rsp) + pxor %xmm11,%xmm11 + movaps %xmm0,0x40(%rsp) + pxor %xmm12,%xmm12 + movaps %xmm0,0x50(%rsp) + pxor %xmm13,%xmm13 + movaps %xmm0,0x60(%rsp) + pxor %xmm14,%xmm14 + movaps %xmm0,0x70(%rsp) + pxor %xmm15,%xmm15 ___ $code.=<<___ if ($win64); movaps -0xa0(%rbp),%xmm6 + movaps %xmm0,-0xa0(%rbp) # clear stack movaps -0x90(%rbp),%xmm7 + movaps %xmm0,-0x90(%rbp) movaps -0x80(%rbp),%xmm8 + movaps %xmm0,-0x80(%rbp) movaps -0x70(%rbp),%xmm9 + movaps %xmm0,-0x70(%rbp) movaps -0x60(%rbp),%xmm10 + movaps %xmm0,-0x60(%rbp) movaps -0x50(%rbp),%xmm11 + movaps %xmm0,-0x50(%rbp) movaps -0x40(%rbp),%xmm12 + movaps %xmm0,-0x40(%rbp) movaps -0x30(%rbp),%xmm13 + movaps %xmm0,-0x30(%rbp) movaps -0x20(%rbp),%xmm14 + movaps %xmm0,-0x20(%rbp) movaps -0x10(%rbp),%xmm15 + movaps %xmm0,-0x10(%rbp) + movaps %xmm0,0x00(%rsp) + movaps %xmm0,0x10(%rsp) + movaps %xmm0,0x20(%rsp) + movaps %xmm0,0x30(%rsp) + movaps %xmm0,0x40(%rsp) + movaps %xmm0,0x50(%rsp) + movaps %xmm0,0x60(%rsp) + movaps %xmm0,0x70(%rsp) ___ $code.=<<___; lea (%rbp),%rsp @@ -1619,7 +1738,7 @@ aesni_xts_encrypt: and \$-16,%rsp # Linux kernel stack can be incorrectly seeded ___ $code.=<<___ if ($win64); - movaps %xmm6,-0xa8(%rax) + movaps %xmm6,-0xa8(%rax) # offload everything movaps %xmm7,-0x98(%rax) movaps %xmm8,-0x88(%rax) movaps %xmm9,-0x78(%rax) @@ -1679,7 +1798,7 @@ $code.=<<___; movaps $rndkey1,0x60(%rsp) # save round[0]^round[last] sub \$16*6,$len - jc .Lxts_enc_short + jc .Lxts_enc_short # if $len-=6*16 borrowed mov \$16+96,$rounds lea 32($key_,$rnds_),$key # end of key schedule @@ -1694,7 +1813,7 @@ $code.=<<___; movdqu `16*0`($inp),$inout0 # load input movdqa $rndkey0,$twmask movdqu `16*1`($inp),$inout1 - pxor @tweak[0],$inout0 + pxor @tweak[0],$inout0 # input^=tweak^round[0] movdqu `16*2`($inp),$inout2 pxor @tweak[1],$inout1 aesenc $rndkey1,$inout0 @@ -1713,10 +1832,10 @@ $code.=<<___; lea `16*6`($inp),$inp pxor $twmask,$inout5 - pxor $twres,@tweak[0] + pxor $twres,@tweak[0] # calclulate tweaks^round[last] aesenc $rndkey1,$inout4 pxor $twres,@tweak[1] - movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^last round key + movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^round[last] aesenc $rndkey1,$inout5 $movkey 48($key_),$rndkey1 pxor $twres,@tweak[2] @@ -1757,7 +1876,7 @@ $code.=<<___; $movkey -80($key,%rax),$rndkey0 jnz .Lxts_enc_loop6 - movdqa (%r8),$twmask + movdqa (%r8),$twmask # start calculating next tweak movdqa $twres,$twtmp paddd $twres,$twres aesenc $rndkey1,$inout0 @@ -1851,15 +1970,15 @@ $code.=<<___; aesenclast `16*5`(%rsp),$inout5 pxor $twres,@tweak[5] - lea `16*6`($out),$out - movups $inout0,`-16*6`($out) # write output + lea `16*6`($out),$out # $out+=6*16 + movups $inout0,`-16*6`($out) # store 6 output blocks movups $inout1,`-16*5`($out) movups $inout2,`-16*4`($out) movups $inout3,`-16*3`($out) movups $inout4,`-16*2`($out) movups $inout5,`-16*1`($out) sub \$16*6,$len - jnc .Lxts_enc_grandloop + jnc .Lxts_enc_grandloop # loop if $len-=6*16 didn't borrow mov \$16+96,$rounds sub $rnds_,$rounds @@ -1867,34 +1986,36 @@ $code.=<<___; shr \$4,$rounds # restore original value .Lxts_enc_short: + # at the point @tweak[0..5] are populated with tweak values mov $rounds,$rnds_ # backup $rounds pxor $rndkey0,@tweak[0] - add \$16*6,$len - jz .Lxts_enc_done + add \$16*6,$len # restore real remaining $len + jz .Lxts_enc_done # done if ($len==0) pxor $rndkey0,@tweak[1] cmp \$0x20,$len - jb .Lxts_enc_one + jb .Lxts_enc_one # $len is 1*16 pxor $rndkey0,@tweak[2] - je .Lxts_enc_two + je .Lxts_enc_two # $len is 2*16 pxor $rndkey0,@tweak[3] cmp \$0x40,$len - jb .Lxts_enc_three + jb .Lxts_enc_three # $len is 3*16 pxor $rndkey0,@tweak[4] - je .Lxts_enc_four + je .Lxts_enc_four # $len is 4*16 - movdqu ($inp),$inout0 + movdqu ($inp),$inout0 # $len is 5*16 movdqu 16*1($inp),$inout1 movdqu 16*2($inp),$inout2 pxor @tweak[0],$inout0 movdqu 16*3($inp),$inout3 pxor @tweak[1],$inout1 movdqu 16*4($inp),$inout4 - lea 16*5($inp),$inp + lea 16*5($inp),$inp # $inp+=5*16 pxor @tweak[2],$inout2 pxor @tweak[3],$inout3 pxor @tweak[4],$inout4 + pxor $inout5,$inout5 call _aesni_encrypt6 @@ -1902,35 +2023,35 @@ $code.=<<___; movdqa @tweak[5],@tweak[0] xorps @tweak[1],$inout1 xorps @tweak[2],$inout2 - movdqu $inout0,($out) + movdqu $inout0,($out) # store 5 output blocks xorps @tweak[3],$inout3 movdqu $inout1,16*1($out) xorps @tweak[4],$inout4 movdqu $inout2,16*2($out) movdqu $inout3,16*3($out) movdqu $inout4,16*4($out) - lea 16*5($out),$out + lea 16*5($out),$out # $out+=5*16 jmp .Lxts_enc_done .align 16 .Lxts_enc_one: movups ($inp),$inout0 - lea 16*1($inp),$inp + lea 16*1($inp),$inp # inp+=1*16 xorps @tweak[0],$inout0 ___ &aesni_generate1("enc",$key,$rounds); $code.=<<___; xorps @tweak[0],$inout0 movdqa @tweak[1],@tweak[0] - movups $inout0,($out) - lea 16*1($out),$out + movups $inout0,($out) # store one output block + lea 16*1($out),$out # $out+=1*16 jmp .Lxts_enc_done .align 16 .Lxts_enc_two: movups ($inp),$inout0 movups 16($inp),$inout1 - lea 32($inp),$inp + lea 32($inp),$inp # $inp+=2*16 xorps @tweak[0],$inout0 xorps @tweak[1],$inout1 @@ -1939,9 +2060,9 @@ $code.=<<___; xorps @tweak[0],$inout0 movdqa @tweak[2],@tweak[0] xorps @tweak[1],$inout1 - movups $inout0,($out) + movups $inout0,($out) # store 2 output blocks movups $inout1,16*1($out) - lea 16*2($out),$out + lea 16*2($out),$out # $out+=2*16 jmp .Lxts_enc_done .align 16 @@ -1949,7 +2070,7 @@ $code.=<<___; movups ($inp),$inout0 movups 16*1($inp),$inout1 movups 16*2($inp),$inout2 - lea 16*3($inp),$inp + lea 16*3($inp),$inp # $inp+=3*16 xorps @tweak[0],$inout0 xorps @tweak[1],$inout1 xorps @tweak[2],$inout2 @@ -1960,10 +2081,10 @@ $code.=<<___; movdqa @tweak[3],@tweak[0] xorps @tweak[1],$inout1 xorps @tweak[2],$inout2 - movups $inout0,($out) + movups $inout0,($out) # store 3 output blocks movups $inout1,16*1($out) movups $inout2,16*2($out) - lea 16*3($out),$out + lea 16*3($out),$out # $out+=3*16 jmp .Lxts_enc_done .align 16 @@ -1973,7 +2094,7 @@ $code.=<<___; movups 16*2($inp),$inout2 xorps @tweak[0],$inout0 movups 16*3($inp),$inout3 - lea 16*4($inp),$inp + lea 16*4($inp),$inp # $inp+=4*16 xorps @tweak[1],$inout1 xorps @tweak[2],$inout2 xorps @tweak[3],$inout3 @@ -1984,17 +2105,17 @@ $code.=<<___; movdqa @tweak[4],@tweak[0] pxor @tweak[1],$inout1 pxor @tweak[2],$inout2 - movdqu $inout0,($out) + movdqu $inout0,($out) # store 4 output blocks pxor @tweak[3],$inout3 movdqu $inout1,16*1($out) movdqu $inout2,16*2($out) movdqu $inout3,16*3($out) - lea 16*4($out),$out + lea 16*4($out),$out # $out+=4*16 jmp .Lxts_enc_done .align 16 .Lxts_enc_done: - and \$15,$len_ + and \$15,$len_ # see if $len%16 is 0 jz .Lxts_enc_ret mov $len_,$len @@ -2021,18 +2142,60 @@ $code.=<<___; movups $inout0,-16($out) .Lxts_enc_ret: + xorps %xmm0,%xmm0 # clear register bank + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 +___ +$code.=<<___ if (!$win64); + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + movaps %xmm0,0x00(%rsp) # clear stack + pxor %xmm8,%xmm8 + movaps %xmm0,0x10(%rsp) + pxor %xmm9,%xmm9 + movaps %xmm0,0x20(%rsp) + pxor %xmm10,%xmm10 + movaps %xmm0,0x30(%rsp) + pxor %xmm11,%xmm11 + movaps %xmm0,0x40(%rsp) + pxor %xmm12,%xmm12 + movaps %xmm0,0x50(%rsp) + pxor %xmm13,%xmm13 + movaps %xmm0,0x60(%rsp) + pxor %xmm14,%xmm14 + pxor %xmm15,%xmm15 ___ $code.=<<___ if ($win64); movaps -0xa0(%rbp),%xmm6 + movaps %xmm0,-0xa0(%rbp) # clear stack movaps -0x90(%rbp),%xmm7 + movaps %xmm0,-0x90(%rbp) movaps -0x80(%rbp),%xmm8 + movaps %xmm0,-0x80(%rbp) movaps -0x70(%rbp),%xmm9 + movaps %xmm0,-0x70(%rbp) movaps -0x60(%rbp),%xmm10 + movaps %xmm0,-0x60(%rbp) movaps -0x50(%rbp),%xmm11 + movaps %xmm0,-0x50(%rbp) movaps -0x40(%rbp),%xmm12 + movaps %xmm0,-0x40(%rbp) movaps -0x30(%rbp),%xmm13 + movaps %xmm0,-0x30(%rbp) movaps -0x20(%rbp),%xmm14 + movaps %xmm0,-0x20(%rbp) movaps -0x10(%rbp),%xmm15 + movaps %xmm0,-0x10(%rbp) + movaps %xmm0,0x00(%rsp) + movaps %xmm0,0x10(%rsp) + movaps %xmm0,0x20(%rsp) + movaps %xmm0,0x30(%rsp) + movaps %xmm0,0x40(%rsp) + movaps %xmm0,0x50(%rsp) + movaps %xmm0,0x60(%rsp) ___ $code.=<<___; lea (%rbp),%rsp @@ -2053,7 +2216,7 @@ aesni_xts_decrypt: and \$-16,%rsp # Linux kernel stack can be incorrectly seeded ___ $code.=<<___ if ($win64); - movaps %xmm6,-0xa8(%rax) + movaps %xmm6,-0xa8(%rax) # offload everything movaps %xmm7,-0x98(%rax) movaps %xmm8,-0x88(%rax) movaps %xmm9,-0x78(%rax) @@ -2116,7 +2279,7 @@ $code.=<<___; movaps $rndkey1,0x60(%rsp) # save round[0]^round[last] sub \$16*6,$len - jc .Lxts_dec_short + jc .Lxts_dec_short # if $len-=6*16 borrowed mov \$16+96,$rounds lea 32($key_,$rnds_),$key # end of key schedule @@ -2131,7 +2294,7 @@ $code.=<<___; movdqu `16*0`($inp),$inout0 # load input movdqa $rndkey0,$twmask movdqu `16*1`($inp),$inout1 - pxor @tweak[0],$inout0 + pxor @tweak[0],$inout0 # intput^=tweak^round[0] movdqu `16*2`($inp),$inout2 pxor @tweak[1],$inout1 aesdec $rndkey1,$inout0 @@ -2150,7 +2313,7 @@ $code.=<<___; lea `16*6`($inp),$inp pxor $twmask,$inout5 - pxor $twres,@tweak[0] + pxor $twres,@tweak[0] # calclulate tweaks^round[last] aesdec $rndkey1,$inout4 pxor $twres,@tweak[1] movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^last round key @@ -2194,7 +2357,7 @@ $code.=<<___; $movkey -80($key,%rax),$rndkey0 jnz .Lxts_dec_loop6 - movdqa (%r8),$twmask + movdqa (%r8),$twmask # start calculating next tweak movdqa $twres,$twtmp paddd $twres,$twres aesdec $rndkey1,$inout0 @@ -2288,15 +2451,15 @@ $code.=<<___; aesdeclast `16*5`(%rsp),$inout5 pxor $twres,@tweak[5] - lea `16*6`($out),$out - movups $inout0,`-16*6`($out) # write output + lea `16*6`($out),$out # $out+=6*16 + movups $inout0,`-16*6`($out) # store 6 output blocks movups $inout1,`-16*5`($out) movups $inout2,`-16*4`($out) movups $inout3,`-16*3`($out) movups $inout4,`-16*2`($out) movups $inout5,`-16*1`($out) sub \$16*6,$len - jnc .Lxts_dec_grandloop + jnc .Lxts_dec_grandloop # loop if $len-=6*16 didn't borrow mov \$16+96,$rounds sub $rnds_,$rounds @@ -2304,31 +2467,32 @@ $code.=<<___; shr \$4,$rounds # restore original value .Lxts_dec_short: + # at the point @tweak[0..5] are populated with tweak values mov $rounds,$rnds_ # backup $rounds pxor $rndkey0,@tweak[0] pxor $rndkey0,@tweak[1] - add \$16*6,$len - jz .Lxts_dec_done + add \$16*6,$len # restore real remaining $len + jz .Lxts_dec_done # done if ($len==0) pxor $rndkey0,@tweak[2] cmp \$0x20,$len - jb .Lxts_dec_one + jb .Lxts_dec_one # $len is 1*16 pxor $rndkey0,@tweak[3] - je .Lxts_dec_two + je .Lxts_dec_two # $len is 2*16 pxor $rndkey0,@tweak[4] cmp \$0x40,$len - jb .Lxts_dec_three - je .Lxts_dec_four + jb .Lxts_dec_three # $len is 3*16 + je .Lxts_dec_four # $len is 4*16 - movdqu ($inp),$inout0 + movdqu ($inp),$inout0 # $len is 5*16 movdqu 16*1($inp),$inout1 movdqu 16*2($inp),$inout2 pxor @tweak[0],$inout0 movdqu 16*3($inp),$inout3 pxor @tweak[1],$inout1 movdqu 16*4($inp),$inout4 - lea 16*5($inp),$inp + lea 16*5($inp),$inp # $inp+=5*16 pxor @tweak[2],$inout2 pxor @tweak[3],$inout3 pxor @tweak[4],$inout4 @@ -2338,7 +2502,7 @@ $code.=<<___; xorps @tweak[0],$inout0 xorps @tweak[1],$inout1 xorps @tweak[2],$inout2 - movdqu $inout0,($out) + movdqu $inout0,($out) # store 5 output blocks xorps @tweak[3],$inout3 movdqu $inout1,16*1($out) xorps @tweak[4],$inout4 @@ -2347,7 +2511,7 @@ $code.=<<___; movdqu $inout3,16*3($out) pcmpgtd @tweak[5],$twtmp movdqu $inout4,16*4($out) - lea 16*5($out),$out + lea 16*5($out),$out # $out+=5*16 pshufd \$0x13,$twtmp,@tweak[1] # $twres and \$15,$len_ jz .Lxts_dec_ret @@ -2361,23 +2525,23 @@ $code.=<<___; .align 16 .Lxts_dec_one: movups ($inp),$inout0 - lea 16*1($inp),$inp + lea 16*1($inp),$inp # $inp+=1*16 xorps @tweak[0],$inout0 ___ &aesni_generate1("dec",$key,$rounds); $code.=<<___; xorps @tweak[0],$inout0 movdqa @tweak[1],@tweak[0] - movups $inout0,($out) + movups $inout0,($out) # store one output block movdqa @tweak[2],@tweak[1] - lea 16*1($out),$out + lea 16*1($out),$out # $out+=1*16 jmp .Lxts_dec_done .align 16 .Lxts_dec_two: movups ($inp),$inout0 movups 16($inp),$inout1 - lea 32($inp),$inp + lea 32($inp),$inp # $inp+=2*16 xorps @tweak[0],$inout0 xorps @tweak[1],$inout1 @@ -2387,9 +2551,9 @@ $code.=<<___; movdqa @tweak[2],@tweak[0] xorps @tweak[1],$inout1 movdqa @tweak[3],@tweak[1] - movups $inout0,($out) + movups $inout0,($out) # store 2 output blocks movups $inout1,16*1($out) - lea 16*2($out),$out + lea 16*2($out),$out # $out+=2*16 jmp .Lxts_dec_done .align 16 @@ -2397,7 +2561,7 @@ $code.=<<___; movups ($inp),$inout0 movups 16*1($inp),$inout1 movups 16*2($inp),$inout2 - lea 16*3($inp),$inp + lea 16*3($inp),$inp # $inp+=3*16 xorps @tweak[0],$inout0 xorps @tweak[1],$inout1 xorps @tweak[2],$inout2 @@ -2409,10 +2573,10 @@ $code.=<<___; xorps @tweak[1],$inout1 movdqa @tweak[4],@tweak[1] xorps @tweak[2],$inout2 - movups $inout0,($out) + movups $inout0,($out) # store 3 output blocks movups $inout1,16*1($out) movups $inout2,16*2($out) - lea 16*3($out),$out + lea 16*3($out),$out # $out+=3*16 jmp .Lxts_dec_done .align 16 @@ -2422,7 +2586,7 @@ $code.=<<___; movups 16*2($inp),$inout2 xorps @tweak[0],$inout0 movups 16*3($inp),$inout3 - lea 16*4($inp),$inp + lea 16*4($inp),$inp # $inp+=4*16 xorps @tweak[1],$inout1 xorps @tweak[2],$inout2 xorps @tweak[3],$inout3 @@ -2434,17 +2598,17 @@ $code.=<<___; pxor @tweak[1],$inout1 movdqa @tweak[5],@tweak[1] pxor @tweak[2],$inout2 - movdqu $inout0,($out) + movdqu $inout0,($out) # store 4 output blocks pxor @tweak[3],$inout3 movdqu $inout1,16*1($out) movdqu $inout2,16*2($out) movdqu $inout3,16*3($out) - lea 16*4($out),$out + lea 16*4($out),$out # $out+=4*16 jmp .Lxts_dec_done .align 16 .Lxts_dec_done: - and \$15,$len_ + and \$15,$len_ # see if $len%16 is 0 jz .Lxts_dec_ret .Lxts_dec_done2: mov $len_,$len @@ -2482,18 +2646,60 @@ $code.=<<___; movups $inout0,($out) .Lxts_dec_ret: + xorps %xmm0,%xmm0 # clear register bank + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 +___ +$code.=<<___ if (!$win64); + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + movaps %xmm0,0x00(%rsp) # clear stack + pxor %xmm8,%xmm8 + movaps %xmm0,0x10(%rsp) + pxor %xmm9,%xmm9 + movaps %xmm0,0x20(%rsp) + pxor %xmm10,%xmm10 + movaps %xmm0,0x30(%rsp) + pxor %xmm11,%xmm11 + movaps %xmm0,0x40(%rsp) + pxor %xmm12,%xmm12 + movaps %xmm0,0x50(%rsp) + pxor %xmm13,%xmm13 + movaps %xmm0,0x60(%rsp) + pxor %xmm14,%xmm14 + pxor %xmm15,%xmm15 ___ $code.=<<___ if ($win64); movaps -0xa0(%rbp),%xmm6 + movaps %xmm0,-0xa0(%rbp) # clear stack movaps -0x90(%rbp),%xmm7 + movaps %xmm0,-0x90(%rbp) movaps -0x80(%rbp),%xmm8 + movaps %xmm0,-0x80(%rbp) movaps -0x70(%rbp),%xmm9 + movaps %xmm0,-0x70(%rbp) movaps -0x60(%rbp),%xmm10 + movaps %xmm0,-0x60(%rbp) movaps -0x50(%rbp),%xmm11 + movaps %xmm0,-0x50(%rbp) movaps -0x40(%rbp),%xmm12 + movaps %xmm0,-0x40(%rbp) movaps -0x30(%rbp),%xmm13 + movaps %xmm0,-0x30(%rbp) movaps -0x20(%rbp),%xmm14 + movaps %xmm0,-0x20(%rbp) movaps -0x10(%rbp),%xmm15 + movaps %xmm0,-0x10(%rbp) + movaps %xmm0,0x00(%rsp) + movaps %xmm0,0x10(%rsp) + movaps %xmm0,0x20(%rsp) + movaps %xmm0,0x30(%rsp) + movaps %xmm0,0x40(%rsp) + movaps %xmm0,0x50(%rsp) + movaps %xmm0,0x60(%rsp) ___ $code.=<<___; lea (%rbp),%rsp @@ -2548,7 +2754,11 @@ $code.=<<___; jnc .Lcbc_enc_loop add \$16,$len jnz .Lcbc_enc_tail + pxor $rndkey0,$rndkey0 # clear register bank + pxor $rndkey1,$rndkey1 movups $inout0,($ivp) + pxor $inout0,$inout0 + pxor $inout1,$inout1 jmp .Lcbc_ret .Lcbc_enc_tail: @@ -2568,6 +2778,27 @@ $code.=<<___; #--------------------------- CBC DECRYPT ------------------------------# .align 16 .Lcbc_decrypt: + cmp \$16,$len + jne .Lcbc_decrypt_bulk + + # handle single block without allocating stack frame, + # useful in ciphertext stealing mode + movdqu ($inp),$inout0 # load input + movdqu ($ivp),$inout1 # load iv + movdqa $inout0,$inout2 # future iv +___ + &aesni_generate1("dec",$key,$rnds_); +$code.=<<___; + pxor $rndkey0,$rndkey0 # clear register bank + pxor $rndkey1,$rndkey1 + movdqu $inout2,($ivp) # store iv + xorps $inout1,$inout0 # ^=iv + pxor $inout1,$inout1 + movups $inout0,($out) # store output + pxor $inout0,$inout0 + jmp .Lcbc_ret +.align 16 +.Lcbc_decrypt_bulk: lea (%rsp),%rax push %rbp sub \$$frame_size,%rsp @@ -2609,11 +2840,11 @@ $code.=<<___; cmp \$0x70,$len jbe .Lcbc_dec_six_or_seven - and \$`1<<26|1<<22`,%r9d # isolate XSAVE+MOVBE - sub \$0x50,$len + and \$`1<<26|1<<22`,%r9d # isolate XSAVE+MOVBE + sub \$0x50,$len # $len is biased by -5*16 cmp \$`1<<22`,%r9d # check for MOVBE without XSAVE - je .Lcbc_dec_loop6_enter - sub \$0x20,$len + je .Lcbc_dec_loop6_enter # [which denotes Atom Silvermont] + sub \$0x20,$len # $len is biased by -7*16 lea 0x70($key),$key # size optimization jmp .Lcbc_dec_loop8_enter .align 16 @@ -2740,7 +2971,7 @@ $code.=<<___; movaps $inout7,$inout0 lea -0x70($key),$key add \$0x70,$len - jle .Lcbc_dec_tail_collected + jle .Lcbc_dec_clear_tail_collected movups $inout7,($out) lea 0x10($out),$out cmp \$0x50,$len @@ -2759,14 +2990,19 @@ $code.=<<___; movdqu $inout0,($out) pxor $in1,$inout2 movdqu $inout1,0x10($out) + pxor $inout1,$inout1 # clear register bank pxor $in2,$inout3 movdqu $inout2,0x20($out) + pxor $inout2,$inout2 pxor $in3,$inout4 movdqu $inout3,0x30($out) + pxor $inout3,$inout3 pxor $in4,$inout5 movdqu $inout4,0x40($out) + pxor $inout4,$inout4 lea 0x50($out),$out movdqa $inout5,$inout0 + pxor $inout5,$inout5 jmp .Lcbc_dec_tail_collected .align 16 @@ -2781,16 +3017,23 @@ $code.=<<___; movdqu $inout0,($out) pxor $in1,$inout2 movdqu $inout1,0x10($out) + pxor $inout1,$inout1 # clear register bank pxor $in2,$inout3 movdqu $inout2,0x20($out) + pxor $inout2,$inout2 pxor $in3,$inout4 movdqu $inout3,0x30($out) + pxor $inout3,$inout3 pxor $in4,$inout5 movdqu $inout4,0x40($out) + pxor $inout4,$inout4 pxor $inout7,$inout6 movdqu $inout5,0x50($out) + pxor $inout5,$inout5 lea 0x60($out),$out movdqa $inout6,$inout0 + pxor $inout6,$inout6 + pxor $inout7,$inout7 jmp .Lcbc_dec_tail_collected .align 16 @@ -2834,31 +3077,31 @@ $code.=<<___; movdqa $inout5,$inout0 add \$0x50,$len - jle .Lcbc_dec_tail_collected + jle .Lcbc_dec_clear_tail_collected movups $inout5,($out) lea 0x10($out),$out .Lcbc_dec_tail: movups ($inp),$inout0 sub \$0x10,$len - jbe .Lcbc_dec_one + jbe .Lcbc_dec_one # $len is 1*16 or less movups 0x10($inp),$inout1 movaps $inout0,$in0 sub \$0x10,$len - jbe .Lcbc_dec_two + jbe .Lcbc_dec_two # $len is 2*16 or less movups 0x20($inp),$inout2 movaps $inout1,$in1 sub \$0x10,$len - jbe .Lcbc_dec_three + jbe .Lcbc_dec_three # $len is 3*16 or less movups 0x30($inp),$inout3 movaps $inout2,$in2 sub \$0x10,$len - jbe .Lcbc_dec_four + jbe .Lcbc_dec_four # $len is 4*16 or less - movups 0x40($inp),$inout4 + movups 0x40($inp),$inout4 # $len is 5*16 or less movaps $inout3,$in3 movaps $inout4,$in4 xorps $inout5,$inout5 @@ -2869,12 +3112,17 @@ $code.=<<___; movdqu $inout0,($out) pxor $in1,$inout2 movdqu $inout1,0x10($out) + pxor $inout1,$inout1 # clear register bank pxor $in2,$inout3 movdqu $inout2,0x20($out) + pxor $inout2,$inout2 pxor $in3,$inout4 movdqu $inout3,0x30($out) + pxor $inout3,$inout3 lea 0x40($out),$out movdqa $inout4,$inout0 + pxor $inout4,$inout4 + pxor $inout5,$inout5 sub \$0x10,$len jmp .Lcbc_dec_tail_collected @@ -2896,6 +3144,7 @@ $code.=<<___; pxor $in0,$inout1 movdqu $inout0,($out) movdqa $inout1,$inout0 + pxor $inout1,$inout1 # clear register bank lea 0x10($out),$out jmp .Lcbc_dec_tail_collected .align 16 @@ -2908,7 +3157,9 @@ $code.=<<___; movdqu $inout0,($out) pxor $in1,$inout2 movdqu $inout1,0x10($out) + pxor $inout1,$inout1 # clear register bank movdqa $inout2,$inout0 + pxor $inout2,$inout2 lea 0x20($out),$out jmp .Lcbc_dec_tail_collected .align 16 @@ -2921,41 +3172,71 @@ $code.=<<___; movdqu $inout0,($out) pxor $in1,$inout2 movdqu $inout1,0x10($out) + pxor $inout1,$inout1 # clear register bank pxor $in2,$inout3 movdqu $inout2,0x20($out) + pxor $inout2,$inout2 movdqa $inout3,$inout0 + pxor $inout3,$inout3 lea 0x30($out),$out jmp .Lcbc_dec_tail_collected .align 16 +.Lcbc_dec_clear_tail_collected: + pxor $inout1,$inout1 # clear register bank + pxor $inout2,$inout2 + pxor $inout3,$inout3 +___ +$code.=<<___ if (!$win64); + pxor $inout4,$inout4 # %xmm6..9 + pxor $inout5,$inout5 + pxor $inout6,$inout6 + pxor $inout7,$inout7 +___ +$code.=<<___; .Lcbc_dec_tail_collected: movups $iv,($ivp) and \$15,$len jnz .Lcbc_dec_tail_partial movups $inout0,($out) + pxor $inout0,$inout0 jmp .Lcbc_dec_ret .align 16 .Lcbc_dec_tail_partial: movaps $inout0,(%rsp) + pxor $inout0,$inout0 mov \$16,%rcx mov $out,%rdi sub $len,%rcx lea (%rsp),%rsi - .long 0x9066A4F3 # rep movsb + .long 0x9066A4F3 # rep movsb + movdqa $inout0,(%rsp) .Lcbc_dec_ret: + xorps $rndkey0,$rndkey0 # %xmm0 + pxor $rndkey1,$rndkey1 ___ $code.=<<___ if ($win64); movaps 0x10(%rsp),%xmm6 + movaps %xmm0,0x10(%rsp) # clear stack movaps 0x20(%rsp),%xmm7 + movaps %xmm0,0x20(%rsp) movaps 0x30(%rsp),%xmm8 + movaps %xmm0,0x30(%rsp) movaps 0x40(%rsp),%xmm9 + movaps %xmm0,0x40(%rsp) movaps 0x50(%rsp),%xmm10 + movaps %xmm0,0x50(%rsp) movaps 0x60(%rsp),%xmm11 + movaps %xmm0,0x60(%rsp) movaps 0x70(%rsp),%xmm12 + movaps %xmm0,0x70(%rsp) movaps 0x80(%rsp),%xmm13 + movaps %xmm0,0x80(%rsp) movaps 0x90(%rsp),%xmm14 + movaps %xmm0,0x90(%rsp) movaps 0xa0(%rsp),%xmm15 + movaps %xmm0,0xa0(%rsp) ___ $code.=<<___; lea (%rbp),%rsp @@ -2965,8 +3246,15 @@ $code.=<<___; .size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt ___ } -# int $PREFIX_set_[en|de]crypt_key (const unsigned char *userKey, +# int ${PREFIX}_set_decrypt_key(const unsigned char *inp, # int bits, AES_KEY *key) +# +# input: $inp user-supplied key +# $bits $inp length in bits +# $key pointer to key schedule +# output: %eax 0 denoting success, -1 or -2 - failure (see C) +# *$key key schedule +# { my ($inp,$bits,$key) = @_4args; $bits =~ s/%r/%e/; @@ -3003,7 +3291,9 @@ ${PREFIX}_set_decrypt_key: $movkey ($key),%xmm0 # inverse middle aesimc %xmm0,%xmm0 + pxor %xmm1,%xmm1 $movkey %xmm0,($inp) + pxor %xmm0,%xmm0 .Ldec_key_ret: add \$8,%rsp ret @@ -3020,6 +3310,22 @@ ___ # Agressively optimized in respect to aeskeygenassist's critical path # and is contained in %xmm0-5 to meet Win64 ABI requirement. # +# int ${PREFIX}_set_encrypt_key(const unsigned char *inp, +# int bits, AES_KEY * const key); +# +# input: $inp user-supplied key +# $bits $inp length in bits +# $key pointer to key schedule +# output: %eax 0 denoting success, -1 or -2 - failure (see C) +# $bits rounds-1 (used in aesni_set_decrypt_key) +# *$key key schedule +# $key pointer to key schedule (used in +# aesni_set_decrypt_key) +# +# Subroutine is frame-less, which means that only volatile registers +# are used. Note that it's declared "abi-omnipotent", which means that +# amount of volatile registers is smaller on Windows. +# $code.=<<___; .globl ${PREFIX}_set_encrypt_key .type ${PREFIX}_set_encrypt_key,\@abi-omnipotent @@ -3033,9 +3339,11 @@ __aesni_set_encrypt_key: test $key,$key jz .Lenc_key_ret + mov \$`1<<28|1<<11`,%r10d # AVX and XOP bits movups ($inp),%xmm0 # pull first 128 bits of *userKey xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0 - lea 16($key),%rax + and OPENSSL_ia32cap_P+4(%rip),%r10d + lea 16($key),%rax # %rax is used as modifiable copy of $key cmp \$256,$bits je .L14rounds cmp \$192,$bits @@ -3045,6 +3353,9 @@ __aesni_set_encrypt_key: .L10rounds: mov \$9,$bits # 10 rounds for 128-bit key + cmp \$`1<<28`,%r10d # AVX, bit no XOP + je .L10rounds_alt + $movkey %xmm0,($key) # round 0 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1 call .Lkey_expansion_128_cold @@ -3072,9 +3383,79 @@ __aesni_set_encrypt_key: jmp .Lenc_key_ret .align 16 +.L10rounds_alt: + movdqa .Lkey_rotate(%rip),%xmm5 + mov \$8,%r10d + movdqa .Lkey_rcon1(%rip),%xmm4 + movdqa %xmm0,%xmm2 + movdqu %xmm0,($key) + jmp .Loop_key128 + +.align 16 +.Loop_key128: + pshufb %xmm5,%xmm0 + aesenclast %xmm4,%xmm0 + pslld \$1,%xmm4 + lea 16(%rax),%rax + + movdqa %xmm2,%xmm3 + pslldq \$4,%xmm2 + pxor %xmm2,%xmm3 + pslldq \$4,%xmm2 + pxor %xmm2,%xmm3 + pslldq \$4,%xmm2 + pxor %xmm3,%xmm2 + + pxor %xmm2,%xmm0 + movdqu %xmm0,-16(%rax) + movdqa %xmm0,%xmm2 + + dec %r10d + jnz .Loop_key128 + + movdqa .Lkey_rcon1b(%rip),%xmm4 + + pshufb %xmm5,%xmm0 + aesenclast %xmm4,%xmm0 + pslld \$1,%xmm4 + + movdqa %xmm2,%xmm3 + pslldq \$4,%xmm2 + pxor %xmm2,%xmm3 + pslldq \$4,%xmm2 + pxor %xmm2,%xmm3 + pslldq \$4,%xmm2 + pxor %xmm3,%xmm2 + + pxor %xmm2,%xmm0 + movdqu %xmm0,(%rax) + + movdqa %xmm0,%xmm2 + pshufb %xmm5,%xmm0 + aesenclast %xmm4,%xmm0 + + movdqa %xmm2,%xmm3 + pslldq \$4,%xmm2 + pxor %xmm2,%xmm3 + pslldq \$4,%xmm2 + pxor %xmm2,%xmm3 + pslldq \$4,%xmm2 + pxor %xmm3,%xmm2 + + pxor %xmm2,%xmm0 + movdqu %xmm0,16(%rax) + + mov $bits,96(%rax) # 240($key) + xor %eax,%eax + jmp .Lenc_key_ret + +.align 16 .L12rounds: movq 16($inp),%xmm2 # remaining 1/3 of *userKey mov \$11,$bits # 12 rounds for 192 + cmp \$`1<<28`,%r10d # AVX, but no XOP + je .L12rounds_alt + $movkey %xmm0,($key) # round 0 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2 call .Lkey_expansion_192a_cold @@ -3098,10 +3479,54 @@ __aesni_set_encrypt_key: jmp .Lenc_key_ret .align 16 +.L12rounds_alt: + movdqa .Lkey_rotate192(%rip),%xmm5 + movdqa .Lkey_rcon1(%rip),%xmm4 + mov \$8,%r10d + movdqu %xmm0,($key) + jmp .Loop_key192 + +.align 16 +.Loop_key192: + movq %xmm2,0(%rax) + movdqa %xmm2,%xmm1 + pshufb %xmm5,%xmm2 + aesenclast %xmm4,%xmm2 + pslld \$1, %xmm4 + lea 24(%rax),%rax + + movdqa %xmm0,%xmm3 + pslldq \$4,%xmm0 + pxor %xmm0,%xmm3 + pslldq \$4,%xmm0 + pxor %xmm0,%xmm3 + pslldq \$4,%xmm0 + pxor %xmm3,%xmm0 + + pshufd \$0xff,%xmm0,%xmm3 + pxor %xmm1,%xmm3 + pslldq \$4,%xmm1 + pxor %xmm1,%xmm3 + + pxor %xmm2,%xmm0 + pxor %xmm3,%xmm2 + movdqu %xmm0,-16(%rax) + + dec %r10d + jnz .Loop_key192 + + mov $bits,32(%rax) # 240($key) + xor %eax,%eax + jmp .Lenc_key_ret + +.align 16 .L14rounds: movups 16($inp),%xmm2 # remaning half of *userKey mov \$13,$bits # 14 rounds for 256 lea 16(%rax),%rax + cmp \$`1<<28`,%r10d # AVX, but no XOP + je .L14rounds_alt + $movkey %xmm0,($key) # round 0 $movkey %xmm2,16($key) # round 1 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2 @@ -3136,9 +3561,69 @@ __aesni_set_encrypt_key: jmp .Lenc_key_ret .align 16 +.L14rounds_alt: + movdqa .Lkey_rotate(%rip),%xmm5 + movdqa .Lkey_rcon1(%rip),%xmm4 + mov \$7,%r10d + movdqu %xmm0,0($key) + movdqa %xmm2,%xmm1 + movdqu %xmm2,16($key) + jmp .Loop_key256 + +.align 16 +.Loop_key256: + pshufb %xmm5,%xmm2 + aesenclast %xmm4,%xmm2 + + movdqa %xmm0,%xmm3 + pslldq \$4,%xmm0 + pxor %xmm0,%xmm3 + pslldq \$4,%xmm0 + pxor %xmm0,%xmm3 + pslldq \$4,%xmm0 + pxor %xmm3,%xmm0 + pslld \$1,%xmm4 + + pxor %xmm2,%xmm0 + movdqu %xmm0,(%rax) + + dec %r10d + jz .Ldone_key256 + + pshufd \$0xff,%xmm0,%xmm2 + pxor %xmm3,%xmm3 + aesenclast %xmm3,%xmm2 + + movdqa %xmm1,%xmm3 + pslldq \$4,%xmm1 + pxor %xmm1,%xmm3 + pslldq \$4,%xmm1 + pxor %xmm1,%xmm3 + pslldq \$4,%xmm1 + pxor %xmm3,%xmm1 + + pxor %xmm1,%xmm2 + movdqu %xmm2,16(%rax) + lea 32(%rax),%rax + movdqa %xmm2,%xmm1 + + jmp .Loop_key256 + +.Ldone_key256: + mov $bits,16(%rax) # 240($key) + xor %eax,%eax + jmp .Lenc_key_ret + +.align 16 .Lbad_keybits: mov \$-2,%rax .Lenc_key_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 add \$8,%rsp ret .LSEH_end_set_encrypt_key: @@ -3228,6 +3713,14 @@ $code.=<<___; .long 0x87,0,1,0 .Lincrement1: .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 +.Lkey_rotate: + .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d +.Lkey_rotate192: + .long 0x04070605,0x04070605,0x04070605,0x04070605 +.Lkey_rcon1: + .long 1,1,1,1 +.Lkey_rcon1b: + .long 0x1b,0x1b,0x1b,0x1b .asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>" .align 64 @@ -3345,7 +3838,7 @@ cbc_se_handler: mov 152($context),%rax # pull context->Rsp mov 248($context),%rbx # pull context->Rip - lea .Lcbc_decrypt(%rip),%r10 + lea .Lcbc_decrypt_bulk(%rip),%r10 cmp %r10,%rbx # context->Rip<"prologue" label jb .Lcommon_seh_tail diff --git a/src/crypto/aes/asm/aesv8-armx.pl b/src/crypto/aes/asm/aesv8-armx.pl index 703da04..b0916f6 100644 --- a/src/crypto/aes/asm/aesv8-armx.pl +++ b/src/crypto/aes/asm/aesv8-armx.pl @@ -24,11 +24,23 @@ # # CBC enc CBC dec CTR # Apple A7 2.39 1.20 1.20 -# Cortex-A53 2.45 1.87 1.94 -# Cortex-A57 3.64 1.34 1.32 +# Cortex-A53 1.32 1.29 1.46 +# Cortex-A57(*) 1.95 0.85 0.93 +# Denver 1.96 0.86 0.80 +# +# (*) original 3.64/1.34/1.32 results were for r0p0 revision +# and are still same even for updated module; $flavour = shift; -open STDOUT,">".shift; +$output = shift; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; $prefix="aes_v8"; @@ -38,10 +50,9 @@ $code=<<___; #if __ARM_MAX_ARCH__>=7 .text ___ - -$code.=<<___ if ($flavour =~ /64/); +$code.=<<___ if ($flavour =~ /64/); #if !defined(__clang__) -.arch armv8-a+crypto +.arch armv8-a+crypto #endif ___ $code.=".arch armv7-a\n.fpu neon\n.code 32\n" if ($flavour !~ /64/); @@ -61,7 +72,7 @@ my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)= $code.=<<___; .align 5 -rcon: +.Lrcon: .long 0x01,0x01,0x01,0x01 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat .long 0x1b,0x1b,0x1b,0x1b @@ -90,7 +101,7 @@ $code.=<<___; tst $bits,#0x3f b.ne .Lenc_key_abort - adr $ptr,rcon + adr $ptr,.Lrcon cmp $bits,#192 veor $zero,$zero,$zero @@ -313,17 +324,17 @@ ${prefix}_${dir}crypt: .Loop_${dir}c: aes$e $inout,$rndkey0 - vld1.32 {$rndkey0},[$key],#16 aes$mc $inout,$inout + vld1.32 {$rndkey0},[$key],#16 subs $rounds,$rounds,#2 aes$e $inout,$rndkey1 - vld1.32 {$rndkey1},[$key],#16 aes$mc $inout,$inout + vld1.32 {$rndkey1},[$key],#16 b.gt .Loop_${dir}c aes$e $inout,$rndkey0 - vld1.32 {$rndkey0},[$key] aes$mc $inout,$inout + vld1.32 {$rndkey0},[$key] aes$e $inout,$rndkey1 veor $inout,$inout,$rndkey0 @@ -341,6 +352,7 @@ my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12"); my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7)); my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1); +my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key); ### q8-q15 preloaded key schedule @@ -390,25 +402,50 @@ $code.=<<___; veor $rndzero_n_last,q8,$rndlast b.eq .Lcbc_enc128 + vld1.32 {$in0-$in1},[$key_] + add $key_,$key,#16 + add $key4,$key,#16*4 + add $key5,$key,#16*5 + aese $dat,q8 + aesmc $dat,$dat + add $key6,$key,#16*6 + add $key7,$key,#16*7 + b .Lenter_cbc_enc + +.align 4 .Loop_cbc_enc: aese $dat,q8 - vld1.32 {q8},[$key_],#16 aesmc $dat,$dat - subs $cnt,$cnt,#2 + vst1.8 {$ivec},[$out],#16 +.Lenter_cbc_enc: aese $dat,q9 - vld1.32 {q9},[$key_],#16 aesmc $dat,$dat - b.gt .Loop_cbc_enc + aese $dat,$in0 + aesmc $dat,$dat + vld1.32 {q8},[$key4] + cmp $rounds,#4 + aese $dat,$in1 + aesmc $dat,$dat + vld1.32 {q9},[$key5] + b.eq .Lcbc_enc192 aese $dat,q8 aesmc $dat,$dat + vld1.32 {q8},[$key6] + aese $dat,q9 + aesmc $dat,$dat + vld1.32 {q9},[$key7] + nop + +.Lcbc_enc192: + aese $dat,q8 + aesmc $dat,$dat subs $len,$len,#16 aese $dat,q9 aesmc $dat,$dat cclr $step,eq aese $dat,q10 aesmc $dat,$dat - add $key_,$key,#16 aese $dat,q11 aesmc $dat,$dat vld1.8 {q8},[$inp],$step @@ -417,16 +454,14 @@ $code.=<<___; veor q8,q8,$rndzero_n_last aese $dat,q13 aesmc $dat,$dat - vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] + vld1.32 {q9},[$key_] // re-pre-load rndkey[1] aese $dat,q14 aesmc $dat,$dat aese $dat,q15 - - mov $cnt,$rounds veor $ivec,$dat,$rndlast - vst1.8 {$ivec},[$out],#16 b.hs .Loop_cbc_enc + vst1.8 {$ivec},[$out],#16 b .Lcbc_done .align 5 @@ -488,79 +523,78 @@ $code.=<<___; .Loop3x_cbc_dec: aesd $dat0,q8 - aesd $dat1,q8 - aesd $dat2,q8 - vld1.32 {q8},[$key_],#16 aesimc $dat0,$dat0 + aesd $dat1,q8 aesimc $dat1,$dat1 + aesd $dat2,q8 aesimc $dat2,$dat2 + vld1.32 {q8},[$key_],#16 subs $cnt,$cnt,#2 aesd $dat0,q9 - aesd $dat1,q9 - aesd $dat2,q9 - vld1.32 {q9},[$key_],#16 aesimc $dat0,$dat0 + aesd $dat1,q9 aesimc $dat1,$dat1 + aesd $dat2,q9 aesimc $dat2,$dat2 + vld1.32 {q9},[$key_],#16 b.gt .Loop3x_cbc_dec aesd $dat0,q8 - aesd $dat1,q8 - aesd $dat2,q8 - veor $tmp0,$ivec,$rndlast aesimc $dat0,$dat0 + aesd $dat1,q8 aesimc $dat1,$dat1 + aesd $dat2,q8 aesimc $dat2,$dat2 + veor $tmp0,$ivec,$rndlast + subs $len,$len,#0x30 veor $tmp1,$in0,$rndlast + mov.lo x6,$len // x6, $cnt, is zero at this point aesd $dat0,q9 - aesd $dat1,q9 - aesd $dat2,q9 - veor $tmp2,$in1,$rndlast - subs $len,$len,#0x30 aesimc $dat0,$dat0 + aesd $dat1,q9 aesimc $dat1,$dat1 + aesd $dat2,q9 aesimc $dat2,$dat2 - vorr $ivec,$in2,$in2 - mov.lo x6,$len // x6, $cnt, is zero at this point - aesd $dat0,q12 - aesd $dat1,q12 - aesd $dat2,q12 + veor $tmp2,$in1,$rndlast add $inp,$inp,x6 // $inp is adjusted in such way that // at exit from the loop $dat1-$dat2 // are loaded with last "words" + vorr $ivec,$in2,$in2 + mov $key_,$key + aesd $dat0,q12 aesimc $dat0,$dat0 + aesd $dat1,q12 aesimc $dat1,$dat1 + aesd $dat2,q12 aesimc $dat2,$dat2 - mov $key_,$key - aesd $dat0,q13 - aesd $dat1,q13 - aesd $dat2,q13 vld1.8 {$in0},[$inp],#16 + aesd $dat0,q13 aesimc $dat0,$dat0 + aesd $dat1,q13 aesimc $dat1,$dat1 + aesd $dat2,q13 aesimc $dat2,$dat2 vld1.8 {$in1},[$inp],#16 aesd $dat0,q14 - aesd $dat1,q14 - aesd $dat2,q14 - vld1.8 {$in2},[$inp],#16 aesimc $dat0,$dat0 + aesd $dat1,q14 aesimc $dat1,$dat1 + aesd $dat2,q14 aesimc $dat2,$dat2 - vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] + vld1.8 {$in2},[$inp],#16 aesd $dat0,q15 aesd $dat1,q15 aesd $dat2,q15 - + vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] add $cnt,$rounds,#2 veor $tmp0,$tmp0,$dat0 veor $tmp1,$tmp1,$dat1 veor $dat2,$dat2,$tmp2 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] - vorr $dat0,$in0,$in0 vst1.8 {$tmp0},[$out],#16 - vorr $dat1,$in1,$in1 + vorr $dat0,$in0,$in0 vst1.8 {$tmp1},[$out],#16 + vorr $dat1,$in1,$in1 vst1.8 {$dat2},[$out],#16 vorr $dat2,$in2,$in2 b.hs .Loop3x_cbc_dec @@ -571,39 +605,39 @@ $code.=<<___; .Lcbc_dec_tail: aesd $dat1,q8 - aesd $dat2,q8 - vld1.32 {q8},[$key_],#16 aesimc $dat1,$dat1 + aesd $dat2,q8 aesimc $dat2,$dat2 + vld1.32 {q8},[$key_],#16 subs $cnt,$cnt,#2 aesd $dat1,q9 - aesd $dat2,q9 - vld1.32 {q9},[$key_],#16 aesimc $dat1,$dat1 + aesd $dat2,q9 aesimc $dat2,$dat2 + vld1.32 {q9},[$key_],#16 b.gt .Lcbc_dec_tail aesd $dat1,q8 - aesd $dat2,q8 aesimc $dat1,$dat1 + aesd $dat2,q8 aesimc $dat2,$dat2 aesd $dat1,q9 - aesd $dat2,q9 aesimc $dat1,$dat1 + aesd $dat2,q9 aesimc $dat2,$dat2 aesd $dat1,q12 - aesd $dat2,q12 aesimc $dat1,$dat1 + aesd $dat2,q12 aesimc $dat2,$dat2 cmn $len,#0x20 aesd $dat1,q13 - aesd $dat2,q13 aesimc $dat1,$dat1 + aesd $dat2,q13 aesimc $dat2,$dat2 veor $tmp1,$ivec,$rndlast aesd $dat1,q14 - aesd $dat2,q14 aesimc $dat1,$dat1 + aesd $dat2,q14 aesimc $dat2,$dat2 veor $tmp2,$in1,$rndlast aesd $dat1,q15 @@ -704,70 +738,69 @@ $code.=<<___; .align 4 .Loop3x_ctr32: aese $dat0,q8 - aese $dat1,q8 - aese $dat2,q8 - vld1.32 {q8},[$key_],#16 aesmc $dat0,$dat0 + aese $dat1,q8 aesmc $dat1,$dat1 + aese $dat2,q8 aesmc $dat2,$dat2 + vld1.32 {q8},[$key_],#16 subs $cnt,$cnt,#2 aese $dat0,q9 - aese $dat1,q9 - aese $dat2,q9 - vld1.32 {q9},[$key_],#16 aesmc $dat0,$dat0 + aese $dat1,q9 aesmc $dat1,$dat1 + aese $dat2,q9 aesmc $dat2,$dat2 + vld1.32 {q9},[$key_],#16 b.gt .Loop3x_ctr32 aese $dat0,q8 - aese $dat1,q8 - aese $dat2,q8 - mov $key_,$key aesmc $tmp0,$dat0 - vld1.8 {$in0},[$inp],#16 + aese $dat1,q8 aesmc $tmp1,$dat1 - aesmc $dat2,$dat2 + vld1.8 {$in0},[$inp],#16 vorr $dat0,$ivec,$ivec - aese $tmp0,q9 + aese $dat2,q8 + aesmc $dat2,$dat2 vld1.8 {$in1},[$inp],#16 - aese $tmp1,q9 - aese $dat2,q9 vorr $dat1,$ivec,$ivec + aese $tmp0,q9 aesmc $tmp0,$tmp0 - vld1.8 {$in2},[$inp],#16 + aese $tmp1,q9 aesmc $tmp1,$tmp1 + vld1.8 {$in2},[$inp],#16 + mov $key_,$key + aese $dat2,q9 aesmc $tmp2,$dat2 vorr $dat2,$ivec,$ivec add $tctr0,$ctr,#1 aese $tmp0,q12 + aesmc $tmp0,$tmp0 aese $tmp1,q12 - aese $tmp2,q12 + aesmc $tmp1,$tmp1 veor $in0,$in0,$rndlast add $tctr1,$ctr,#2 - aesmc $tmp0,$tmp0 - aesmc $tmp1,$tmp1 + aese $tmp2,q12 aesmc $tmp2,$tmp2 veor $in1,$in1,$rndlast add $ctr,$ctr,#3 aese $tmp0,q13 + aesmc $tmp0,$tmp0 aese $tmp1,q13 - aese $tmp2,q13 + aesmc $tmp1,$tmp1 veor $in2,$in2,$rndlast rev $tctr0,$tctr0 - aesmc $tmp0,$tmp0 - vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] - aesmc $tmp1,$tmp1 + aese $tmp2,q13 aesmc $tmp2,$tmp2 vmov.32 ${dat0}[3], $tctr0 rev $tctr1,$tctr1 aese $tmp0,q14 + aesmc $tmp0,$tmp0 aese $tmp1,q14 - aese $tmp2,q14 + aesmc $tmp1,$tmp1 vmov.32 ${dat1}[3], $tctr1 rev $tctr2,$ctr - aesmc $tmp0,$tmp0 - aesmc $tmp1,$tmp1 + aese $tmp2,q14 aesmc $tmp2,$tmp2 vmov.32 ${dat2}[3], $tctr2 subs $len,$len,#3 @@ -775,13 +808,14 @@ $code.=<<___; aese $tmp1,q15 aese $tmp2,q15 - mov $cnt,$rounds veor $in0,$in0,$tmp0 + vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] + vst1.8 {$in0},[$out],#16 veor $in1,$in1,$tmp1 + mov $cnt,$rounds + vst1.8 {$in1},[$out],#16 veor $in2,$in2,$tmp2 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] - vst1.8 {$in0},[$out],#16 - vst1.8 {$in1},[$out],#16 vst1.8 {$in2},[$out],#16 b.hs .Loop3x_ctr32 @@ -793,40 +827,40 @@ $code.=<<___; .Lctr32_tail: aese $dat0,q8 - aese $dat1,q8 - vld1.32 {q8},[$key_],#16 aesmc $dat0,$dat0 + aese $dat1,q8 aesmc $dat1,$dat1 + vld1.32 {q8},[$key_],#16 subs $cnt,$cnt,#2 aese $dat0,q9 - aese $dat1,q9 - vld1.32 {q9},[$key_],#16 aesmc $dat0,$dat0 + aese $dat1,q9 aesmc $dat1,$dat1 + vld1.32 {q9},[$key_],#16 b.gt .Lctr32_tail aese $dat0,q8 - aese $dat1,q8 aesmc $dat0,$dat0 + aese $dat1,q8 aesmc $dat1,$dat1 aese $dat0,q9 - aese $dat1,q9 aesmc $dat0,$dat0 + aese $dat1,q9 aesmc $dat1,$dat1 vld1.8 {$in0},[$inp],$step aese $dat0,q12 - aese $dat1,q12 - vld1.8 {$in1},[$inp] aesmc $dat0,$dat0 + aese $dat1,q12 aesmc $dat1,$dat1 + vld1.8 {$in1},[$inp] aese $dat0,q13 - aese $dat1,q13 aesmc $dat0,$dat0 + aese $dat1,q13 aesmc $dat1,$dat1 - aese $dat0,q14 - aese $dat1,q14 veor $in0,$in0,$rndlast + aese $dat0,q14 aesmc $dat0,$dat0 + aese $dat1,q14 aesmc $dat1,$dat1 veor $in1,$in1,$rndlast aese $dat0,q15 diff --git a/src/crypto/aes/asm/bsaes-armv7.pl b/src/crypto/aes/asm/bsaes-armv7.pl index d70f3ea..a5e4a98 100644 --- a/src/crypto/aes/asm/bsaes-armv7.pl +++ b/src/crypto/aes/asm/bsaes-armv7.pl @@ -47,8 +47,20 @@ # # <ard.biesheuvel@linaro.org> -while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} -open STDOUT,">$output"; +$flavour = shift; +if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} my ($inp,$out,$len,$key)=("r0","r1","r2","r3"); my @XMM=map("q$_",(0..15)); @@ -703,29 +715,35 @@ $code.=<<___; # define BSAES_ASM_EXTENDED_KEY # define XTS_CHAIN_TWEAK # define __ARM_ARCH__ __LINUX_ARM_ARCH__ +# define __ARM_MAX_ARCH__ 7 #endif #ifdef __thumb__ # define adrl adr #endif -#if __ARM_ARCH__>=7 +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + .text .syntax unified @ ARMv7-capable assembler is expected to handle this -#ifdef __thumb2__ +#if defined(__thumb2__) && !defined(__APPLE__) .thumb #else .code 32 #endif -.fpu neon - .type _bsaes_decrypt8,%function .align 4 _bsaes_decrypt8: adr $const,_bsaes_decrypt8 vldmia $key!, {@XMM[9]} @ round 0 key +#ifdef __APPLE__ + adr $const,.LM0ISR +#else add $const,$const,#.LM0ISR-_bsaes_decrypt8 +#endif vldmia $const!, {@XMM[8]} @ .LM0ISR veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key @@ -820,7 +838,11 @@ _bsaes_const: _bsaes_encrypt8: adr $const,_bsaes_encrypt8 vldmia $key!, {@XMM[9]} @ round 0 key +#ifdef __APPLE__ + adr $const,.LM0SR +#else sub $const,$const,#_bsaes_encrypt8-.LM0SR +#endif vldmia $const!, {@XMM[8]} @ .LM0SR _bsaes_encrypt8_alt: @@ -924,7 +946,11 @@ $code.=<<___; _bsaes_key_convert: adr $const,_bsaes_key_convert vld1.8 {@XMM[7]}, [$inp]! @ load round 0 key +#ifdef __APPLE__ + adr $const,.LM0 +#else sub $const,$const,#_bsaes_key_convert-.LM0 +#endif vld1.8 {@XMM[15]}, [$inp]! @ load round 1 key vmov.i8 @XMM[8], #0x01 @ bit masks @@ -1397,7 +1423,12 @@ bsaes_ctr32_encrypt_blocks: vstmia r12, {@XMM[7]} @ save last round key vld1.8 {@XMM[0]}, [$ctr] @ load counter +#ifdef __APPLE__ + mov $ctr, #.LREVM0SR-.LM0 + add $ctr, $const, $ctr +#else add $ctr, $const, #.LREVM0SR-.LM0 @ borrow $ctr +#endif vldmia $keysched, {@XMM[4]} @ load round0 key #else ldr r12, [$key, #244] @@ -1454,7 +1485,12 @@ bsaes_ctr32_encrypt_blocks: vldmia $ctr, {@XMM[8]} @ .LREVM0SR mov r5, $rounds @ pass rounds vstmia $fp, {@XMM[10]} @ save next counter +#ifdef __APPLE__ + mov $const, #.LREVM0SR-.LSR + sub $const, $ctr, $const +#else sub $const, $ctr, #.LREVM0SR-.LSR @ pass constants +#endif bl _bsaes_encrypt8_alt @@ -1555,7 +1591,7 @@ bsaes_ctr32_encrypt_blocks: rev r8, r8 #endif sub sp, sp, #0x10 - vst1.8 {@XMM[1]}, [sp,:64] @ copy counter value + vst1.8 {@XMM[1]}, [sp] @ copy counter value sub sp, sp, #0x10 .Lctr_enc_short_loop: @@ -1566,7 +1602,7 @@ bsaes_ctr32_encrypt_blocks: bl AES_encrypt vld1.8 {@XMM[0]}, [r4]! @ load input - vld1.8 {@XMM[1]}, [sp,:64] @ load encrypted counter + vld1.8 {@XMM[1]}, [sp] @ load encrypted counter add r8, r8, #1 #ifdef __ARMEL__ rev r0, r8 @@ -2085,9 +2121,11 @@ bsaes_xts_decrypt: vld1.8 {@XMM[8]}, [r0] @ initial tweak adr $magic, .Lxts_magic +#ifndef XTS_CHAIN_TWEAK tst $len, #0xf @ if not multiple of 16 it ne @ Thumb2 thing, sanity check in ARM subne $len, #0x10 @ subtract another 16 bytes +#endif subs $len, #0x80 blo .Lxts_dec_short |