Diffstat (limited to 'src/crypto/sha')
-rw-r--r--  src/crypto/sha/asm/sha1-586.pl          |   2
-rw-r--r--  src/crypto/sha/asm/sha1-armv4-large.pl  |  23
-rw-r--r--  src/crypto/sha/asm/sha1-armv8.pl        |  17
-rw-r--r--  src/crypto/sha/asm/sha256-armv4.pl      | 101
-rw-r--r--  src/crypto/sha/asm/sha512-armv4.pl      | 100
-rw-r--r--  src/crypto/sha/asm/sha512-armv8.pl      |  35
-rw-r--r--  src/crypto/sha/sha1.c                   |   3
-rw-r--r--  src/crypto/sha/sha512.c                 |  24
8 files changed, 245 insertions(+), 60 deletions(-)
diff --git a/src/crypto/sha/asm/sha1-586.pl b/src/crypto/sha/asm/sha1-586.pl index 8377299..4895eb3 100644 --- a/src/crypto/sha/asm/sha1-586.pl +++ b/src/crypto/sha/asm/sha1-586.pl @@ -450,7 +450,7 @@ sub sha1msg2 { sha1op38(0xca,@_); } &sub ("esp",32); &movdqu ($ABCD,&QWP(0,$ctx)); - &movd ($E,&QWP(16,$ctx)); + &movd ($E,&DWP(16,$ctx)); &and ("esp",-32); &movdqa ($BSWAP,&QWP(0x50,$tmp1)); # byte-n-word swap diff --git a/src/crypto/sha/asm/sha1-armv4-large.pl b/src/crypto/sha/asm/sha1-armv4-large.pl index 1ffa041..a20d336 100644 --- a/src/crypto/sha/asm/sha1-armv4-large.pl +++ b/src/crypto/sha/asm/sha1-armv4-large.pl @@ -60,14 +60,28 @@ # is ~2.5x larger and there are some redundant instructions executed # when processing last block, improvement is not as big for smallest # blocks, only ~30%. Snapdragon S4 is a tad faster, 6.4 cycles per -# byte, which is also >80% faster than integer-only code. +# byte, which is also >80% faster than integer-only code. Cortex-A15 +# is even faster spending 5.6 cycles per byte outperforming integer- +# only code by factor of 2. # May 2014. # # Add ARMv8 code path performing at 2.35 cpb on Apple A7. -while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} -open STDOUT,">$output"; +$flavour = shift; +if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} $ctx="r0"; $inp="r1"; @@ -178,6 +192,9 @@ sha1_block_data_order: sub r3,pc,#8 @ sha1_block_data_order ldr r12,.LOPENSSL_armcap ldr r12,[r3,r12] @ OPENSSL_armcap_P +#ifdef __APPLE__ + ldr r12,[r12] +#endif tst r12,#ARMV8_SHA1 bne .LARMv8 tst r12,#ARMV7_NEON diff --git a/src/crypto/sha/asm/sha1-armv8.pl b/src/crypto/sha/asm/sha1-armv8.pl index deb1238..a8c08c2 100644 --- a/src/crypto/sha/asm/sha1-armv8.pl +++ b/src/crypto/sha/asm/sha1-armv8.pl @@ -14,13 +14,25 @@ # # hardware-assisted software(*) # Apple A7 2.31 4.13 (+14%) -# Cortex-A53 2.19 8.73 (+108%) +# Cortex-A53 2.24 8.03 (+97%) # Cortex-A57 2.35 7.88 (+74%) +# Denver 2.13 3.97 (+0%)(**) +# X-Gene 8.80 (+200%) # # (*) Software results are presented mostly for reference purposes. +# (**) Keep in mind that Denver relies on binary translation, which +# optimizes compiler output at run-time. $flavour = shift; -open STDOUT,">".shift; +$output = shift; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; ($ctx,$inp,$num)=("x0","x1","x2"); @Xw=map("w$_",(3..17,19)); @@ -154,6 +166,7 @@ $code.=<<___; .text +.extern OPENSSL_armcap_P .globl sha1_block_data_order .type sha1_block_data_order,%function .align 6 diff --git a/src/crypto/sha/asm/sha256-armv4.pl b/src/crypto/sha/asm/sha256-armv4.pl index 398376e..778c3d9 100644 --- a/src/crypto/sha/asm/sha256-armv4.pl +++ b/src/crypto/sha/asm/sha256-armv4.pl @@ -5,6 +5,8 @@ # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. +# +# Permission to use under GPL terms is granted. 
# ==================================================================== # SHA256 block procedure for ARMv4. May 2007. @@ -35,8 +37,20 @@ # # Add ARMv8 code path performing at 2.0 cpb on Apple A7. -while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} -open STDOUT,">$output"; +$flavour = shift; +if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} $ctx="r0"; $t0="r0"; $inp="r1"; $t4="r1"; @@ -71,7 +85,9 @@ $code.=<<___ if ($i<16); eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` add $a,$a,$t2 @ h+=Maj(a,b,c) from the past eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) +# ifndef __ARMEB__ rev $t1,$t1 +# endif #else @ ldrb $t1,[$inp,#3] @ $i add $a,$a,$t2 @ h+=Maj(a,b,c) from the past @@ -151,10 +167,25 @@ ___ } $code=<<___; -#include "arm_arch.h" +#ifndef __KERNEL__ +# include "arm_arch.h" +#else +# define __ARM_ARCH__ __LINUX_ARM_ARCH__ +# define __ARM_MAX_ARCH__ 7 +#endif .text +#if __ARM_ARCH__<7 .code 32 +#else +.syntax unified +# if defined(__thumb2__) && !defined(__APPLE__) +# define adrl adr +.thumb +# else +.code 32 +# endif +#endif .type K256,%object .align 5 @@ -177,25 +208,33 @@ K256: .word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .size K256,.-K256 .word 0 @ terminator -#if __ARM_MAX_ARCH__>=7 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) .LOPENSSL_armcap: -.word OPENSSL_armcap_P-sha256_block_data_order +.word OPENSSL_armcap_P-.Lsha256_block_data_order #endif .align 5 .global sha256_block_data_order .type sha256_block_data_order,%function sha256_block_data_order: +.Lsha256_block_data_order: +#if __ARM_ARCH__<7 sub r3,pc,#8 @ sha256_block_data_order - add $len,$inp,$len,lsl#6 @ len to point at the end of inp -#if __ARM_MAX_ARCH__>=7 +#else + adr r3,sha256_block_data_order +#endif +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) ldr r12,.LOPENSSL_armcap ldr r12,[r3,r12] @ OPENSSL_armcap_P +#ifdef __APPLE__ + ldr r12,[r12] +#endif tst r12,#ARMV8_SHA256 bne .LARMv8 tst r12,#ARMV7_NEON bne .LNEON #endif + add $len,$inp,$len,lsl#6 @ len to point at the end of inp stmdb sp!,{$ctx,$inp,$len,r4-r11,lr} ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H} sub $Ktbl,r3,#256+32 @ K256 @@ -213,6 +252,9 @@ for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); } $code.=".Lrounds_16_xx:\n"; for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); } $code.=<<___; +#if __ARM_ARCH__>=7 + ite eq @ Thumb2 thing, sanity check in ARM +#endif ldreq $t3,[sp,#16*4] @ pull ctx bne .Lrounds_16_xx @@ -429,16 +471,19 @@ $code.=<<___; .arch armv7-a .fpu neon +.global sha256_block_data_order_neon .type sha256_block_data_order_neon,%function .align 4 sha256_block_data_order_neon: .LNEON: stmdb sp!,{r4-r12,lr} + sub $H,sp,#16*4+16 + adr $Ktbl,K256 + bic $H,$H,#15 @ align for 128-bit stores mov $t2,sp - sub sp,sp,#16*4+16 @ alloca - sub $Ktbl,r3,#256+32 @ K256 - bic sp,sp,#15 @ align for 128-bit stores + mov sp,$H @ alloca + add $len,$inp,$len,lsl#6 @ len to point at the end of inp vld1.8 {@X[0]},[$inp]! vld1.8 {@X[1]},[$inp]! @@ -490,11 +535,13 @@ $code.=<<___; ldr $t0,[sp,#72] sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl teq $inp,$t0 + it eq subeq $inp,$inp,#64 @ avoid SEGV vld1.8 {@X[0]},[$inp]! 
@ load next input block vld1.8 {@X[1]},[$inp]! vld1.8 {@X[2]},[$inp]! vld1.8 {@X[3]},[$inp]! + it ne strne $inp,[sp,#68] mov $Xfer,sp ___ @@ -526,10 +573,12 @@ $code.=<<___; str $D,[$t1],#4 stmia $t1,{$E-$H} + ittte ne movne $Xfer,sp ldrne $t1,[sp,#0] eorne $t2,$t2,$t2 ldreq sp,[sp,#76] @ restore original sp + itt ne eorne $t3,$B,$C bne .L_00_48 @@ -548,13 +597,28 @@ my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15)); my $Ktbl="r3"; $code.=<<___; -#if __ARM_MAX_ARCH__>=7 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + +# if defined(__thumb2__) && !defined(__APPLE__) +# define INST(a,b,c,d) .byte c,d|0xc,a,b +# else +# define INST(a,b,c,d) .byte a,b,c,d +# endif + .type sha256_block_data_order_armv8,%function .align 5 sha256_block_data_order_armv8: .LARMv8: vld1.32 {$ABCD,$EFGH},[$ctx] - sub $Ktbl,r3,#sha256_block_data_order-K256 +# ifdef __APPLE__ + sub $Ktbl,$Ktbl,#256+32 +# elif defined(__thumb2__) + adr $Ktbl,.LARMv8 + sub $Ktbl,$Ktbl,#.LARMv8-K256 +# else + adrl $Ktbl,K256 +# endif + add $len,$inp,$len,lsl#6 @ len to point at the end of inp .Loop_v8: vld1.8 {@MSG[0]-@MSG[1]},[$inp]! @@ -607,6 +671,7 @@ $code.=<<___; vadd.i32 $ABCD,$ABCD,$ABCD_SAVE vadd.i32 $EFGH,$EFGH,$EFGH_SAVE + it ne bne .Loop_v8 vst1.32 {$ABCD,$EFGH},[$ctx] @@ -619,12 +684,20 @@ ___ $code.=<<___; .asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>" .align 2 -#if __ARM_MAX_ARCH__>=7 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) .comm OPENSSL_armcap_P,4,4 .hidden OPENSSL_armcap_P #endif ___ +open SELF,$0; +while(<SELF>) { + next if (/^#!/); + last if (!s/^#/@/ and !/^$/); + print; +} +close SELF; + { my %opcode = ( "sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40, "sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 ); @@ -639,7 +712,7 @@ ___ # since ARMv7 instructions are always encoded little-endian. # correct solution is to use .inst directive, but older # assemblers don't implement it:-( - sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s", + sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s", $word&0xff,($word>>8)&0xff, ($word>>16)&0xff,($word>>24)&0xff, $mnemonic,$arg; diff --git a/src/crypto/sha/asm/sha512-armv4.pl b/src/crypto/sha/asm/sha512-armv4.pl index bfe28c4..2964a39 100644 --- a/src/crypto/sha/asm/sha512-armv4.pl +++ b/src/crypto/sha/asm/sha512-armv4.pl @@ -5,6 +5,8 @@ # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. +# +# Permission to use under GPL terms is granted. # ==================================================================== # SHA512 block procedure for ARMv4. September 2007. @@ -34,16 +36,9 @@ # terms it's 22.6 cycles per byte, which is disappointing result. # Technical writers asserted that 3-way S4 pipeline can sustain # multiple NEON instructions per cycle, but dual NEON issue could -# not be observed, and for NEON-only sequences IPC(*) was found to -# be limited by 1:-( 0.33 and 0.66 were measured for sequences with -# ILPs(*) of 1 and 2 respectively. This in turn means that you can -# even find yourself striving, as I did here, for achieving IPC -# adequate to one delivered by Cortex A8 [for reference, it's -# 0.5 for ILP of 1, and 1 for higher ILPs]. -# -# (*) ILP, instruction-level parallelism, how many instructions -# *can* execute at the same time. IPC, instructions per cycle, -# indicates how many instructions actually execute. 
+# not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html +# for further details. On side note Cortex-A15 processes one byte in +# 16 cycles. # Byte order [in]dependence. ========================================= # @@ -55,8 +50,20 @@ $hi="HI"; $lo="LO"; # ==================================================================== -while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} -open STDOUT,">$output"; +$flavour = shift; +if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} $ctx="r0"; # parameter block $inp="r1"; @@ -143,6 +150,9 @@ $code.=<<___; teq $t0,#$magic ldr $t3,[sp,#$Coff+0] @ c.lo +#if __ARM_ARCH__>=7 + it eq @ Thumb2 thing, sanity check in ARM +#endif orreq $Ktbl,$Ktbl,#1 @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 @@ -180,7 +190,17 @@ $code.=<<___; ___ } $code=<<___; -#include "arm_arch.h" +#ifndef __KERNEL__ +# include "arm_arch.h" +# define VFP_ABI_PUSH vstmdb sp!,{d8-d15} +# define VFP_ABI_POP vldmia sp!,{d8-d15} +#else +# define __ARM_ARCH__ __LINUX_ARM_ARCH__ +# define __ARM_MAX_ARCH__ 7 +# define VFP_ABI_PUSH +# define VFP_ABI_POP +#endif + #ifdef __ARMEL__ # define LO 0 # define HI 4 @@ -192,7 +212,18 @@ $code=<<___; #endif .text +#if __ARM_ARCH__<7 || defined(__APPLE__) .code 32 +#else +.syntax unified +# ifdef __thumb2__ +# define adrl adr +.thumb +# else +.code 32 +# endif +#endif + .type K512,%object .align 5 K512: @@ -237,9 +268,9 @@ WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c) WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a) WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) .size K512,.-K512 -#if __ARM_MAX_ARCH__>=7 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) .LOPENSSL_armcap: -.word OPENSSL_armcap_P-sha512_block_data_order +.word OPENSSL_armcap_P-.Lsha512_block_data_order .skip 32-4 #else .skip 32 @@ -248,14 +279,22 @@ WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) .global sha512_block_data_order .type sha512_block_data_order,%function sha512_block_data_order: +.Lsha512_block_data_order: +#if __ARM_ARCH__<7 sub r3,pc,#8 @ sha512_block_data_order - add $len,$inp,$len,lsl#7 @ len to point at the end of inp -#if __ARM_MAX_ARCH__>=7 +#else + adr r3,sha512_block_data_order +#endif +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) ldr r12,.LOPENSSL_armcap ldr r12,[r3,r12] @ OPENSSL_armcap_P +#ifdef __APPLE__ + ldr r12,[r12] +#endif tst r12,#1 bne .LNEON #endif + add $len,$inp,$len,lsl#7 @ len to point at the end of inp stmdb sp!,{r4-r12,lr} sub $Ktbl,r3,#672 @ K512 sub sp,sp,#9*8 @@ -369,6 +408,9 @@ $code.=<<___; ___ &BODY_00_15(0x17); $code.=<<___; +#if __ARM_ARCH__>=7 + ittt eq @ Thumb2 thing, sanity check in ARM +#endif ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0] ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4] beq .L16_79 @@ -453,6 +495,7 @@ $code.=<<___; moveq pc,lr @ be binary compatible with V4, yet bx lr @ interoperable with Thumb ISA:-) #endif +.size sha512_block_data_order,.-sha512_block_data_order ___ { @@ -559,11 +602,15 @@ $code.=<<___; .arch armv7-a .fpu neon +.global sha512_block_data_order_neon +.type sha512_block_data_order_neon,%function .align 4 
+sha512_block_data_order_neon: .LNEON: dmb @ errata #451034 on early Cortex A8 - vstmdb sp!,{d8-d15} @ ABI specification says so - sub $Ktbl,r3,#672 @ K512 + add $len,$inp,$len,lsl#7 @ len to point at the end of inp + adr $Ktbl,K512 + VFP_ABI_PUSH vldmia $ctx,{$A-$H} @ load context .Loop_neon: ___ @@ -588,16 +635,16 @@ $code.=<<___; sub $Ktbl,#640 @ rewind K512 bne .Loop_neon - vldmia sp!,{d8-d15} @ epilogue + VFP_ABI_POP ret @ bx lr +.size sha512_block_data_order_neon,.-sha512_block_data_order_neon #endif ___ } $code.=<<___; -.size sha512_block_data_order,.-sha512_block_data_order .asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>" .align 2 -#if __ARM_MAX_ARCH__>=7 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) .comm OPENSSL_armcap_P,4,4 .hidden OPENSSL_armcap_P #endif @@ -606,5 +653,14 @@ ___ $code =~ s/\`([^\`]*)\`/eval $1/gem; $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 $code =~ s/\bret\b/bx lr/gm; + +open SELF,$0; +while(<SELF>) { + next if (/^#!/); + last if (!s/^#/@/ and !/^$/); + print; +} +close SELF; + print $code; close STDOUT; # enforce flush diff --git a/src/crypto/sha/asm/sha512-armv8.pl b/src/crypto/sha/asm/sha512-armv8.pl index 5a9c812..43e7293 100644 --- a/src/crypto/sha/asm/sha512-armv8.pl +++ b/src/crypto/sha/asm/sha512-armv8.pl @@ -14,8 +14,10 @@ # # SHA256-hw SHA256(*) SHA512 # Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) -# Cortex-A53 2.38 15.6 (+110%) 10.1 (+190%(***)) +# Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) # Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) +# Denver 2.01 10.5 (+26%) 6.70 (+8%) +# X-Gene 20.0 (+100%) 12.8 (+300%(***)) # # (*) Software SHA256 results are of lesser relevance, presented # mostly for informational purposes. @@ -25,12 +27,24 @@ # (***) Super-impressive coefficients over gcc-generated code are # indication of some compiler "pathology", most notably code # generated with -mgeneral-regs-only is significanty faster -# and lags behind assembly only by 50-90%. +# and the gap is only 40-90%. $flavour=shift; +# Unlike most perlasm files, sha512-armv8.pl takes an additional argument to +# determine which hash function to emit. This differs from upstream OpenSSL so +# that the script may continue to output to stdout. 
+$variant=shift; $output=shift; -if ($output =~ /512/) { +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +if ($variant eq "sha512") { $BITS=512; $SZ=8; @Sigma0=(28,34,39); @@ -39,7 +53,7 @@ if ($output =~ /512/) { @sigma1=(19,61, 6); $rounds=80; $reg_t="x"; -} else { +} elsif ($variant eq "sha256") { $BITS=256; $SZ=4; @Sigma0=( 2,13,22); @@ -48,6 +62,8 @@ if ($output =~ /512/) { @sigma1=(17,19,10); $rounds=64; $reg_t="w"; +} else { + die "Unknown variant: $variant"; } $func="sha${BITS}_block_data_order"; @@ -152,6 +168,7 @@ $code.=<<___; .text +.extern OPENSSL_armcap_P .globl $func .type $func,%function .align 6 @@ -181,7 +198,7 @@ $code.=<<___; ldp $E,$F,[$ctx,#4*$SZ] add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input ldp $G,$H,[$ctx,#6*$SZ] - adr $Ktbl,K$BITS + adr $Ktbl,.LK$BITS stp $ctx,$num,[x29,#96] .Loop: @@ -231,8 +248,8 @@ $code.=<<___; .size $func,.-$func .align 6 -.type K$BITS,%object -K$BITS: +.type .LK$BITS,%object +.LK$BITS: ___ $code.=<<___ if ($SZ==8); .quad 0x428a2f98d728ae22,0x7137449123ef65cd @@ -297,7 +314,7 @@ $code.=<<___ if ($SZ==4); .long 0 //terminator ___ $code.=<<___; -.size K$BITS,.-K$BITS +.size .LK$BITS,.-.LK$BITS .align 3 .LOPENSSL_armcap_P: .quad OPENSSL_armcap_P-. @@ -322,7 +339,7 @@ sha256_block_armv8: add x29,sp,#0 ld1.32 {$ABCD,$EFGH},[$ctx] - adr $Ktbl,K256 + adr $Ktbl,.LK256 .Loop_hw: ld1 {@MSG[0]-@MSG[3]},[$inp],#64 diff --git a/src/crypto/sha/sha1.c b/src/crypto/sha/sha1.c index 7595bc8..60d09f6 100644 --- a/src/crypto/sha/sha1.c +++ b/src/crypto/sha/sha1.c @@ -367,8 +367,9 @@ static void HASH_BLOCK_DATA_ORDER(SHA_CTX *c, const void *p, size_t num) { c->h3 = (c->h3 + B) & 0xffffffffL; c->h4 = (c->h4 + C) & 0xffffffffL; - if (--num == 0) + if (--num == 0) { break; + } A = c->h0; B = c->h1; diff --git a/src/crypto/sha/sha512.c b/src/crypto/sha/sha512.c index 59be8c1..2acefb1 100644 --- a/src/crypto/sha/sha512.c +++ b/src/crypto/sha/sha512.c @@ -189,8 +189,9 @@ int SHA512_Update(SHA512_CTX *c, const void *in_data, size_t len) { uint8_t *p = c->u.p; const uint8_t *data = (const uint8_t *)in_data; - if (len == 0) + if (len == 0) { return 1; + } l = (c->Nl + (((uint64_t)len) << 3)) & OPENSSL_U64(0xffffffffffffffff); if (l < c->Nl) { @@ -218,14 +219,21 @@ int SHA512_Update(SHA512_CTX *c, const void *in_data, size_t len) { if (len >= sizeof(c->u)) { #ifndef SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA - if ((size_t)data % sizeof(c->u.d[0]) != 0) - while (len >= sizeof(c->u)) - memcpy(p, data, sizeof(c->u)), sha512_block_data_order(c, p, 1), - len -= sizeof(c->u), data += sizeof(c->u); - else + if ((size_t)data % sizeof(c->u.d[0]) != 0) { + while (len >= sizeof(c->u)) { + memcpy(p, data, sizeof(c->u)); + sha512_block_data_order(c, p, 1); + len -= sizeof(c->u); + data += sizeof(c->u); + } + } else #endif - sha512_block_data_order(c, data, len / sizeof(c->u)), data += len, - len %= sizeof(c->u), data -= len; + { + sha512_block_data_order(c, data, len / sizeof(c->u)); + data += len; + len %= sizeof(c->u); + data -= len; + } } if (len != 0) { |