Diffstat (limited to 'src/crypto/sha/asm')
-rw-r--r--  src/crypto/sha/asm/sha1-586.pl            2
-rw-r--r--  src/crypto/sha/asm/sha1-armv4-large.pl   23
-rw-r--r--  src/crypto/sha/asm/sha1-armv8.pl         17
-rw-r--r--  src/crypto/sha/asm/sha256-armv4.pl      101
-rw-r--r--  src/crypto/sha/asm/sha512-armv4.pl      100
-rw-r--r--  src/crypto/sha/asm/sha512-armv8.pl       35
6 files changed, 227 insertions, 51 deletions
diff --git a/src/crypto/sha/asm/sha1-586.pl b/src/crypto/sha/asm/sha1-586.pl
index 8377299..4895eb3 100644
--- a/src/crypto/sha/asm/sha1-586.pl
+++ b/src/crypto/sha/asm/sha1-586.pl
@@ -450,7 +450,7 @@ sub sha1msg2 { sha1op38(0xca,@_); }
&sub ("esp",32);
&movdqu ($ABCD,&QWP(0,$ctx));
- &movd ($E,&QWP(16,$ctx));
+ &movd ($E,&DWP(16,$ctx));
&and ("esp",-32);
&movdqa ($BSWAP,&QWP(0x50,$tmp1)); # byte-n-word swap
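The sha1-586.pl hunk above is a one-line operand-size fix: movd transfers 32 bits, so the memory operand for loading the SHA-1 "e" word at $ctx+16 should be built with &DWP (dword) rather than &QWP (qword). A hedged sketch of the distinction, with stub helpers standing in for the real x86 perlasm functions:

    # illustrative stubs only -- the real &DWP/&QWP live in the x86
    # perlasm framework and also handle index, scale and output flavour
    sub DWP { my ($off, $base) = @_; "DWORD PTR ${off}[${base}]" }
    sub QWP { my ($off, $base) = @_; "QWORD PTR ${off}[${base}]" }
    # movd reads 32 bits, so the fixed line builds a dword operand:
    print "movd\txmm1,", DWP(16, "ecx"), "\n";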
diff --git a/src/crypto/sha/asm/sha1-armv4-large.pl b/src/crypto/sha/asm/sha1-armv4-large.pl
index 1ffa041..a20d336 100644
--- a/src/crypto/sha/asm/sha1-armv4-large.pl
+++ b/src/crypto/sha/asm/sha1-armv4-large.pl
@@ -60,14 +60,28 @@
# is ~2.5x larger and there are some redundant instructions executed
# when processing the last block, the improvement is not as big for the
# smallest blocks, only ~30%. Snapdragon S4 is a tad faster, 6.4 cycles per
-# byte, which is also >80% faster than integer-only code.
+# byte, which is also >80% faster than integer-only code. Cortex-A15
+# is even faster, spending 5.6 cycles per byte and outperforming
+# integer-only code by a factor of 2.
# May 2014.
#
# Add ARMv8 code path performing at 2.35 cpb on Apple A7.
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
$ctx="r0";
$inp="r1";
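The new prologue keeps the old invocation (output file only) working while adding an optional flavour argument; when a flavour is present and not "void", everything the script prints is piped through arm-xlate.pl, which translates the generic source into the target platform's assembler dialect. A minimal sketch of how the first argument is classified (argument values are hypothetical):

    # anything shaped like a file name is taken as the output,
    # everything else is treated as a flavour for arm-xlate.pl
    for my $arg ("sha1-armv4-large.S", "linux32", "ios32") {
        printf "%-20s => %s\n", $arg,
               $arg =~ /^\w[\w\-]*\.\w+$/ ? "output file" : "flavour";
    }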
@@ -178,6 +192,9 @@ sha1_block_data_order:
sub r3,pc,#8 @ sha1_block_data_order
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P
+#ifdef __APPLE__
+ ldr r12,[r12]
+#endif
tst r12,#ARMV8_SHA1
bne .LARMv8
tst r12,#ARMV7_NEON
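The extra ldr under __APPLE__ suggests that on Mach-O the .LOPENSSL_armcap reference resolves to a pointer to OPENSSL_armcap_P rather than to the variable itself, so one more dereference is needed; that reading is an inference. The dispatch that follows is a plain flag test. A hedged model in Perl, with the masks left symbolic because their real values live in arm_arch.h:

    # $ARMV8_SHA1/$ARMV7_NEON stand in for the arm_arch.h constants,
    # whose numeric values are not reproduced here
    sub sha1_impl {
        my ($armcap, $ARMV8_SHA1, $ARMV7_NEON) = @_;
        return ".LARMv8" if $armcap & $ARMV8_SHA1;   # first tst/bne pair
        return ".LNEON"  if $armcap & $ARMV7_NEON;   # second tst/bne pair
        return "integer-only path";
    }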
diff --git a/src/crypto/sha/asm/sha1-armv8.pl b/src/crypto/sha/asm/sha1-armv8.pl
index deb1238..a8c08c2 100644
--- a/src/crypto/sha/asm/sha1-armv8.pl
+++ b/src/crypto/sha/asm/sha1-armv8.pl
@@ -14,13 +14,25 @@
#
#              hardware-assisted    software(*)
# Apple A7     2.31                 4.13 (+14%)
-# Cortex-A53  2.19                 8.73 (+108%)
+# Cortex-A53  2.24                 8.03 (+97%)
# Cortex-A57   2.35                 7.88 (+74%)
+# Denver      2.13                 3.97 (+0%)(**)
+# X-Gene                           8.80 (+200%)
#
# (*) Software results are presented mostly for reference purposes.
+# (**) Keep in mind that Denver relies on binary translation, which
+# optimizes compiler output at run-time.
$flavour = shift;
-open STDOUT,">".shift;
+$output = shift;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
($ctx,$inp,$num)=("x0","x1","x2");
@Xw=map("w$_",(3..17,19));
@@ -154,6 +166,7 @@ $code.=<<___;
.text
+.extern OPENSSL_armcap_P
.globl sha1_block_data_order
.type sha1_block_data_order,%function
.align 6
diff --git a/src/crypto/sha/asm/sha256-armv4.pl b/src/crypto/sha/asm/sha256-armv4.pl
index 398376e..778c3d9 100644
--- a/src/crypto/sha/asm/sha256-armv4.pl
+++ b/src/crypto/sha/asm/sha256-armv4.pl
@@ -5,6 +5,8 @@
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
+#
+# Permission to use under GPL terms is granted.
# ====================================================================
# SHA256 block procedure for ARMv4. May 2007.
@@ -35,8 +37,20 @@
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
$ctx="r0"; $t0="r0";
$inp="r1"; $t4="r1";
@@ -71,7 +85,9 @@ $code.=<<___ if ($i<16);
eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
+# ifndef __ARMEB__
rev $t1,$t1
+# endif
#else
@ ldrb $t1,[$inp,#3] @ $i
add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
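The new __ARMEB__ guard skips the rev byte swap on big-endian builds: SHA-256 consumes big-endian words, so only little-endian targets need the correction after loading. A worked illustration of what rev does to a loaded word:

    # rev reverses the four bytes of a 32-bit word, which is exactly
    # the little-endian-load -> big-endian-word correction
    my $loaded = unpack("V", "\x01\x02\x03\x04");    # LE load: 0x04030201
    my $rev    = unpack("N", pack("V", $loaded));    # rev:     0x01020304
    printf "%08x -> %08x\n", $loaded, $rev;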
@@ -151,10 +167,25 @@ ___
}
$code=<<___;
-#include "arm_arch.h"
+#ifndef __KERNEL__
+# include "arm_arch.h"
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+#endif
.text
+#if __ARM_ARCH__<7
.code 32
+#else
+.syntax unified
+# if defined(__thumb2__) && !defined(__APPLE__)
+# define adrl adr
+.thumb
+# else
+.code 32
+# endif
+#endif
.type K256,%object
.align 5
@@ -177,25 +208,33 @@ K256:
.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size K256,.-K256
.word 0 @ terminator
-#if __ARM_MAX_ARCH__>=7
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
-.word OPENSSL_armcap_P-sha256_block_data_order
+.word OPENSSL_armcap_P-.Lsha256_block_data_order
#endif
.align 5
.global sha256_block_data_order
.type sha256_block_data_order,%function
sha256_block_data_order:
+.Lsha256_block_data_order:
+#if __ARM_ARCH__<7
sub r3,pc,#8 @ sha256_block_data_order
- add $len,$inp,$len,lsl#6 @ len to point at the end of inp
-#if __ARM_MAX_ARCH__>=7
+#else
+ adr r3,sha256_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P
+#ifdef __APPLE__
+ ldr r12,[r12]
+#endif
tst r12,#ARMV8_SHA256
bne .LARMv8
tst r12,#ARMV7_NEON
bne .LNEON
#endif
+ add $len,$inp,$len,lsl#6 @ len to point at the end of inp
stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
sub $Ktbl,r3,#256+32 @ K256
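This hunk also reworks the position-independent lookup of OPENSSL_armcap_P: the .word holds the link-time difference between the variable and the new local label .Lsha256_block_data_order, and ldr r12,[r3,r12] adds that difference back to the label's run-time address in r3. Referencing the local label instead of the global function symbol plausibly avoids the Thumb bit that linkers fold into the value of a Thumb function symbol; treat that rationale as an inference. A worked sketch of the arithmetic with made-up addresses:

    # the assembler stores only the distance; the code rebuilds the
    # variable's address from the label's run-time address
    my $label  = 0x10400;              # .Lsha256_block_data_order, hypothetical
    my $armcap = 0x34f00;              # OPENSSL_armcap_P, hypothetical
    my $word   = $armcap - $label;     # what the .word ends up holding
    printf "0x%05x\n", $label + $word; # ldr r12,[r3,r12] yields 0x34f00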
@@ -213,6 +252,9 @@ for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
+#if __ARM_ARCH__>=7
+ ite eq @ Thumb2 thing, sanity check in ARM
+#endif
ldreq $t3,[sp,#16*4] @ pull ctx
bne .Lrounds_16_xx
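The ite eq inserted here is the Thumb-2 IT prefix: it declares the next two instructions conditional, the first (ldreq) executing on EQ and the second (bne) on NE. In ARM mode, unified-syntax assemblers accept the prefix purely as a consistency check on the conditional instructions that follow, which is what the "sanity check in ARM" comment alludes to. A toy model of the then/else pairing:

    # hedged toy model of "ite eq": one then-slot, one else-slot
    my $Z = 1;                                  # pretend the Z flag is set
    print "ldreq runs (then, EQ)\n" if  $Z;     # first slot: executes
    print "bne   runs (else, NE)\n" if !$Z;     # second slot: skipped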
@@ -429,16 +471,19 @@ $code.=<<___;
.arch armv7-a
.fpu neon
+.global sha256_block_data_order_neon
.type sha256_block_data_order_neon,%function
.align 4
sha256_block_data_order_neon:
.LNEON:
stmdb sp!,{r4-r12,lr}
+ sub $H,sp,#16*4+16
+ adr $Ktbl,K256
+ bic $H,$H,#15 @ align for 128-bit stores
mov $t2,sp
- sub sp,sp,#16*4+16 @ alloca
- sub $Ktbl,r3,#256+32 @ K256
- bic sp,sp,#15 @ align for 128-bit stores
+ mov sp,$H @ alloca
+ add $len,$inp,$len,lsl#6 @ len to point at the end of inp
vld1.8 {@X[0]},[$inp]!
vld1.8 {@X[1]},[$inp]!
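The reshuffled NEON prologue computes the aligned stack pointer in a scratch register ($H) before moving it into sp, rather than applying bic to sp directly; a plausible reason is that Thumb-2 forbids most data-processing writes to sp, though that motivation is an inference. The alignment arithmetic itself, worked through with a made-up stack pointer:

    # clearing the low four bits (bic #15) rounds down to the
    # 16-byte boundary the 128-bit vst1 stores want
    my $sp        = 0x7fff_1234;
    my $candidate = $sp - (16*4 + 16);     # sub $H,sp,#16*4+16
    my $aligned   = $candidate & ~15;      # bic $H,$H,#15
    printf "%08x -> %08x\n", $candidate, $aligned;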
@@ -490,11 +535,13 @@ $code.=<<___;
ldr $t0,[sp,#72]
sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl
teq $inp,$t0
+ it eq
subeq $inp,$inp,#64 @ avoid SEGV
vld1.8 {@X[0]},[$inp]! @ load next input block
vld1.8 {@X[1]},[$inp]!
vld1.8 {@X[2]},[$inp]!
vld1.8 {@X[3]},[$inp]!
+ it ne
strne $inp,[sp,#68]
mov $Xfer,sp
___
@@ -526,10 +573,12 @@ $code.=<<___;
str $D,[$t1],#4
stmia $t1,{$E-$H}
+ ittte ne
movne $Xfer,sp
ldrne $t1,[sp,#0]
eorne $t2,$t2,$t2
ldreq sp,[sp,#76] @ restore original sp
+ itt ne
eorne $t3,$B,$C
bne .L_00_48
@@ -548,13 +597,28 @@ my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";
$code.=<<___;
-#if __ARM_MAX_ARCH__>=7
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+
+# if defined(__thumb2__) && !defined(__APPLE__)
+# define INST(a,b,c,d) .byte c,d|0xc,a,b
+# else
+# define INST(a,b,c,d) .byte a,b,c,d
+# endif
+
.type sha256_block_data_order_armv8,%function
.align 5
sha256_block_data_order_armv8:
.LARMv8:
vld1.32 {$ABCD,$EFGH},[$ctx]
- sub $Ktbl,r3,#sha256_block_data_order-K256
+# ifdef __APPLE__
+ sub $Ktbl,$Ktbl,#256+32
+# elif defined(__thumb2__)
+ adr $Ktbl,.LARMv8
+ sub $Ktbl,$Ktbl,#.LARMv8-K256
+# else
+ adrl $Ktbl,K256
+# endif
+ add $len,$inp,$len,lsl#6 @ len to point at the end of inp
.Loop_v8:
vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
@@ -607,6 +671,7 @@ $code.=<<___;
vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
vadd.i32 $EFGH,$EFGH,$EFGH_SAVE
+ it ne
bne .Loop_v8
vst1.32 {$ABCD,$EFGH},[$ctx]
@@ -619,12 +684,20 @@ ___
$code.=<<___;
.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
-#if __ARM_MAX_ARCH__>=7
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm OPENSSL_armcap_P,4,4
.hidden OPENSSL_armcap_P
#endif
___
+open SELF,$0;
+while(<SELF>) {
+ next if (/^#!/);
+ last if (!s/^#/@/ and !/^$/);
+ print;
+}
+close SELF;
+
{ my %opcode = (
"sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40,
"sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 );
@@ -639,7 +712,7 @@ ___
# since ARMv7 instructions are always encoded little-endian.
# correct solution is to use .inst directive, but older
# assemblers don't implement it:-(
- sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
+ sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
$word&0xff,($word>>8)&0xff,
($word>>16)&0xff,($word>>24)&0xff,
$mnemonic,$arg;
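The INST() macro exists because these SHA-256 extension instructions are emitted as raw bytes from the %opcode table above, and ARM mode stores a 32-bit instruction as one little-endian word while Thumb-2 stores it as two little-endian halfwords; on top of that, the ARM-mode 0xf3 prefix byte becomes 0xff in the Thumb-2 encodings used here, which the d|0xc term supplies. A worked check for sha256h (ARM opcode 0xf3000c40, taken from the table):

    my $word = 0xf3000c40;
    my @b = map { ($word >> 8*$_) & 0xff } 0..3;   # (a,b,c,d) = LE bytes
    printf "ARM:     .byte 0x%02x,0x%02x,0x%02x,0x%02x\n", @b;   # 40,0c,00,f3
    printf "Thumb-2: .byte 0x%02x,0x%02x,0x%02x,0x%02x\n",       # 00,ff,40,0c
           $b[2], $b[3]|0x0c, $b[0], $b[1];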
diff --git a/src/crypto/sha/asm/sha512-armv4.pl b/src/crypto/sha/asm/sha512-armv4.pl
index bfe28c4..2964a39 100644
--- a/src/crypto/sha/asm/sha512-armv4.pl
+++ b/src/crypto/sha/asm/sha512-armv4.pl
@@ -5,6 +5,8 @@
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
+#
+# Permission to use under GPL terms is granted.
# ====================================================================
# SHA512 block procedure for ARMv4. September 2007.
@@ -34,16 +36,9 @@
# terms it's 22.6 cycles per byte, which is a disappointing result.
# Technical writers asserted that the 3-way S4 pipeline can sustain
# multiple NEON instructions per cycle, but dual NEON issue could
-# not be observed, and for NEON-only sequences IPC(*) was found to
-# be limited by 1:-( 0.33 and 0.66 were measured for sequences with
-# ILPs(*) of 1 and 2 respectively. This in turn means that you can
-# even find yourself striving, as I did here, for achieving IPC
-# adequate to one delivered by Cortex A8 [for reference, it's
-# 0.5 for ILP of 1, and 1 for higher ILPs].
-#
-# (*) ILP, instruction-level parallelism, how many instructions
-# *can* execute at the same time. IPC, instructions per cycle,
-# indicates how many instructions actually execute.
+# not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html
+# for further details. As a side note, Cortex-A15 processes one byte in
+# 16 cycles.
# Byte order [in]dependence. =========================================
#
@@ -55,8 +50,20 @@ $hi="HI";
$lo="LO";
# ====================================================================
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
$ctx="r0"; # parameter block
$inp="r1";
@@ -143,6 +150,9 @@ $code.=<<___;
teq $t0,#$magic
ldr $t3,[sp,#$Coff+0] @ c.lo
+#if __ARM_ARCH__>=7
+ it eq @ Thumb2 thing, sanity check in ARM
+#endif
orreq $Ktbl,$Ktbl,#1
@ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
@ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
@@ -180,7 +190,17 @@ $code.=<<___;
___
}
$code=<<___;
-#include "arm_arch.h"
+#ifndef __KERNEL__
+# include "arm_arch.h"
+# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
+# define VFP_ABI_POP vldmia sp!,{d8-d15}
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+# define VFP_ABI_PUSH
+# define VFP_ABI_POP
+#endif
+
#ifdef __ARMEL__
# define LO 0
# define HI 4
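The VFP_ABI_PUSH/POP pair encodes an AAPCS rule: d8-d15 are callee-saved, so the user-space NEON path must save and restore them around use, while __KERNEL__ builds define the macros away (presumably because kernel callers manage FP state themselves; that reason is an inference). A sketch of the two expansions the preprocessor chooses between:

    my $kernel = 0;   # pretend: user-space build
    my ($push, $pop) = $kernel
        ? ("", "")                                          # __KERNEL__: no-ops
        : ("vstmdb sp!,{d8-d15}", "vldmia sp!,{d8-d15}");   # AAPCS save/restore
    print "VFP_ABI_PUSH => $push\nVFP_ABI_POP  => $pop\n";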
@@ -192,7 +212,18 @@ $code=<<___;
#endif
.text
+#if __ARM_ARCH__<7 || defined(__APPLE__)
.code 32
+#else
+.syntax unified
+# ifdef __thumb2__
+# define adrl adr
+.thumb
+# else
+.code 32
+# endif
+#endif
+
.type K512,%object
.align 5
K512:
@@ -237,9 +268,9 @@ WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size K512,.-K512
-#if __ARM_MAX_ARCH__>=7
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
-.word OPENSSL_armcap_P-sha512_block_data_order
+.word OPENSSL_armcap_P-.Lsha512_block_data_order
.skip 32-4
#else
.skip 32
@@ -248,14 +279,22 @@ WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.global sha512_block_data_order
.type sha512_block_data_order,%function
sha512_block_data_order:
+.Lsha512_block_data_order:
+#if __ARM_ARCH__<7
sub r3,pc,#8 @ sha512_block_data_order
- add $len,$inp,$len,lsl#7 @ len to point at the end of inp
-#if __ARM_MAX_ARCH__>=7
+#else
+ adr r3,sha512_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P
+#ifdef __APPLE__
+ ldr r12,[r12]
+#endif
tst r12,#1
bne .LNEON
#endif
+ add $len,$inp,$len,lsl#7 @ len to point at the end of inp
stmdb sp!,{r4-r12,lr}
sub $Ktbl,r3,#672 @ K512
sub sp,sp,#9*8
@@ -369,6 +408,9 @@ $code.=<<___;
___
&BODY_00_15(0x17);
$code.=<<___;
+#if __ARM_ARCH__>=7
+ ittt eq @ Thumb2 thing, sanity check in ARM
+#endif
ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0]
ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4]
beq .L16_79
@@ -453,6 +495,7 @@ $code.=<<___;
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
+.size sha512_block_data_order,.-sha512_block_data_order
___
{
@@ -559,11 +602,15 @@ $code.=<<___;
.arch armv7-a
.fpu neon
+.global sha512_block_data_order_neon
+.type sha512_block_data_order_neon,%function
.align 4
+sha512_block_data_order_neon:
.LNEON:
dmb @ errata #451034 on early Cortex A8
- vstmdb sp!,{d8-d15} @ ABI specification says so
- sub $Ktbl,r3,#672 @ K512
+ add $len,$inp,$len,lsl#7 @ len to point at the end of inp
+ adr $Ktbl,K512
+ VFP_ABI_PUSH
vldmia $ctx,{$A-$H} @ load context
.Loop_neon:
___
@@ -588,16 +635,16 @@ $code.=<<___;
sub $Ktbl,#640 @ rewind K512
bne .Loop_neon
- vldmia sp!,{d8-d15} @ epilogue
+ VFP_ABI_POP
ret @ bx lr
+.size sha512_block_data_order_neon,.-sha512_block_data_order_neon
#endif
___
}
$code.=<<___;
-.size sha512_block_data_order,.-sha512_block_data_order
.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
-#if __ARM_MAX_ARCH__>=7
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm OPENSSL_armcap_P,4,4
.hidden OPENSSL_armcap_P
#endif
@@ -606,5 +653,14 @@ ___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx lr/gm;
+
+open SELF,$0;
+while(<SELF>) {
+ next if (/^#!/);
+ last if (!s/^#/@/ and !/^$/);
+ print;
+}
+close SELF;
+
print $code;
close STDOUT; # enforce flush
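The closing substitutions keep the output assemblable with -march=armv4, where gas rejects the bx mnemonic. Because the bx-lr rewrite runs first, literal "bx lr" sequences (in the ARMv4-compatible epilogue) become raw instruction words, while "ret" -- which in this file appears only in the NEON code after .arch armv7-a -- is then spelled back out as a genuine bx lr. A worked check that 0xe12fff1e really is the encoding of bx lr:

    # condition AL (0xe) in bits 31..28, the BX pattern 0x12fff1 in
    # bits 27..4, and Rm = lr = r14 in bits 3..0
    my $insn = (0xe << 28) | (0x12fff1 << 4) | 14;
    printf "0x%08x\n", $insn;    # prints 0xe12fff1e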
diff --git a/src/crypto/sha/asm/sha512-armv8.pl b/src/crypto/sha/asm/sha512-armv8.pl
index 5a9c812..43e7293 100644
--- a/src/crypto/sha/asm/sha512-armv8.pl
+++ b/src/crypto/sha/asm/sha512-armv8.pl
@@ -14,8 +14,10 @@
#
#              SHA256-hw    SHA256(*)       SHA512
# Apple A7     1.97         10.5 (+33%)     6.73 (-1%(**))
-# Cortex-A53  2.38         15.6 (+110%)    10.1 (+190%(***))
+# Cortex-A53  2.38         15.5 (+115%)    10.0 (+150%(***))
# Cortex-A57   2.31         11.6 (+86%)     7.51 (+260%(***))
+# Denver      2.01         10.5 (+26%)     6.70 (+8%)
+# X-Gene                   20.0 (+100%)    12.8 (+300%(***))
#
# (*) Software SHA256 results are of lesser relevance, presented
# mostly for informational purposes.
@@ -25,12 +27,24 @@
# (***) Super-impressive coefficients over gcc-generated code are
# indication of some compiler "pathology", most notably code
# generated with -mgeneral-regs-only is significantly faster
-# and lags behind assembly only by 50-90%.
+# and the gap is only 40-90%.
$flavour=shift;
+# Unlike most perlasm files, sha512-armv8.pl takes an additional argument to
+# determine which hash function to emit. This differs from upstream OpenSSL,
+# which infers the function from the output file name; passing it explicitly
+# lets the script keep writing to stdout.
+$variant=shift;
$output=shift;
-if ($output =~ /512/) {
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+if ($variant eq "sha512") {
$BITS=512;
$SZ=8;
@Sigma0=(28,34,39);
@@ -39,7 +53,7 @@ if ($output =~ /512/) {
@sigma1=(19,61, 6);
$rounds=80;
$reg_t="x";
-} else {
+} elsif ($variant eq "sha256") {
$BITS=256;
$SZ=4;
@Sigma0=( 2,13,22);
@@ -48,6 +62,8 @@ if ($output =~ /512/) {
@sigma1=(17,19,10);
$rounds=64;
$reg_t="w";
+} else {
+ die "Unknown variant: $variant";
}
$func="sha${BITS}_block_data_order";
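Since the variant is now an explicit argument, invocation differs from the two-argument scripts earlier in this patch. A hedged usage sketch, with hypothetical file names:

    # argument order is flavour, variant, output:
    #
    #   perl sha512-armv8.pl linux64 sha256 sha256-armv8.S
    #   perl sha512-armv8.pl ios64   sha512 sha512-armv8.S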
@@ -152,6 +168,7 @@ $code.=<<___;
.text
+.extern OPENSSL_armcap_P
.globl $func
.type $func,%function
.align 6
@@ -181,7 +198,7 @@ $code.=<<___;
ldp $E,$F,[$ctx,#4*$SZ]
add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input
ldp $G,$H,[$ctx,#6*$SZ]
- adr $Ktbl,K$BITS
+ adr $Ktbl,.LK$BITS
stp $ctx,$num,[x29,#96]
.Loop:
@@ -231,8 +248,8 @@ $code.=<<___;
.size $func,.-$func
.align 6
-.type K$BITS,%object
-K$BITS:
+.type .LK$BITS,%object
+.LK$BITS:
___
$code.=<<___ if ($SZ==8);
.quad 0x428a2f98d728ae22,0x7137449123ef65cd
@@ -297,7 +314,7 @@ $code.=<<___ if ($SZ==4);
.long 0 //terminator
___
$code.=<<___;
-.size K$BITS,.-K$BITS
+.size .LK$BITS,.-.LK$BITS
.align 3
.LOPENSSL_armcap_P:
.quad OPENSSL_armcap_P-.
@@ -322,7 +339,7 @@ sha256_block_armv8:
add x29,sp,#0
ld1.32 {$ABCD,$EFGH},[$ctx]
- adr $Ktbl,K256
+ adr $Ktbl,.LK256
.Loop_hw:
ld1 {@MSG[0]-@MSG[3]},[$inp],#64