Diffstat (limited to 'src/crypto/sha/asm')
-rw-r--r--  src/crypto/sha/asm/sha1-586.pl            2
-rw-r--r--  src/crypto/sha/asm/sha1-armv4-large.pl   23
-rw-r--r--  src/crypto/sha/asm/sha1-armv8.pl         17
-rw-r--r--  src/crypto/sha/asm/sha256-armv4.pl      101
-rw-r--r--  src/crypto/sha/asm/sha512-armv4.pl      100
-rw-r--r--  src/crypto/sha/asm/sha512-armv8.pl       35
6 files changed, 227 insertions, 51 deletions
diff --git a/src/crypto/sha/asm/sha1-586.pl b/src/crypto/sha/asm/sha1-586.pl
index 8377299..4895eb3 100644
--- a/src/crypto/sha/asm/sha1-586.pl
+++ b/src/crypto/sha/asm/sha1-586.pl
@@ -450,7 +450,7 @@ sub sha1msg2 { sha1op38(0xca,@_); }
&sub ("esp",32);
&movdqu ($ABCD,&QWP(0,$ctx));
- &movd ($E,&QWP(16,$ctx));
+ &movd ($E,&DWP(16,$ctx));
&and ("esp",-32);
&movdqa ($BSWAP,&QWP(0x50,$tmp1)); # byte-n-word swap
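The sha1-586.pl hunk above is a one-line operand-size fix: movd transfers 32 bits, so the memory operand for loading the SHA-1 "e" word at $ctx+16 should be built with &DWP (dword) rather than &QWP (qword). A hedged sketch of the distinction, with stub helpers standing in for the real x86 perlasm functions:

    # illustrative stubs only -- the real &DWP/&QWP live in the x86
    # perlasm framework and also handle index, scale and output flavour
    sub DWP { my ($off, $base) = @_; "DWORD PTR ${off}[${base}]" }
    sub QWP { my ($off, $base) = @_; "QWORD PTR ${off}[${base}]" }
    # movd reads 32 bits, so the fixed line builds a dword operand:
    print "movd\txmm1,", DWP(16, "ecx"), "\n";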
diff --git a/src/crypto/sha/asm/sha1-armv4-large.pl b/src/crypto/sha/asm/sha1-armv4-large.pl
index 1ffa041..a20d336 100644
--- a/src/crypto/sha/asm/sha1-armv4-large.pl
+++ b/src/crypto/sha/asm/sha1-armv4-large.pl
@@ -60,14 +60,28 @@
# is ~2.5x larger and there are some redundant instructions executed
# when processing the last block, the improvement is not as big for the
# smallest blocks, only ~30%. Snapdragon S4 is a tad faster, 6.4 cycles per
-# byte, which is also >80% faster than integer-only code.
+# byte, which is also >80% faster than integer-only code. Cortex-A15
+# is even faster, spending 5.6 cycles per byte and outperforming
+# integer-only code by a factor of 2.
# May 2014.
#
# Add ARMv8 code path performing at 2.35 cpb on Apple A7.
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
$ctx="r0";
$inp="r1";
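The new prologue keeps the old invocation (output file only) working while adding an optional flavour argument; when a flavour is present and not "void", everything the script prints is piped through arm-xlate.pl, which translates the generic source into the target platform's assembler dialect. A minimal sketch of how the first argument is classified (argument values are hypothetical):

    # anything shaped like a file name is taken as the output,
    # everything else is treated as a flavour for arm-xlate.pl
    for my $arg ("sha1-armv4-large.S", "linux32", "ios32") {
        printf "%-20s => %s\n", $arg,
               $arg =~ /^\w[\w\-]*\.\w+$/ ? "output file" : "flavour";
    }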
@@ -178,6 +192,9 @@ sha1_block_data_order:
sub r3,pc,#8 @ sha1_block_data_order
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P
+#ifdef __APPLE__
+ ldr r12,[r12]
+#endif
tst r12,#ARMV8_SHA1
bne .LARMv8
tst r12,#ARMV7_NEON
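The extra ldr under __APPLE__ suggests that on Mach-O the .LOPENSSL_armcap reference resolves to a pointer to OPENSSL_armcap_P rather than to the variable itself, so one more dereference is needed; that reading is an inference. The dispatch that follows is a plain flag test. A hedged model in Perl, with the masks left symbolic because their real values live in arm_arch.h:

    # $ARMV8_SHA1/$ARMV7_NEON stand in for the arm_arch.h constants,
    # whose numeric values are not reproduced here
    sub sha1_impl {
        my ($armcap, $ARMV8_SHA1, $ARMV7_NEON) = @_;
        return ".LARMv8" if $armcap & $ARMV8_SHA1;   # first tst/bne pair
        return ".LNEON"  if $armcap & $ARMV7_NEON;   # second tst/bne pair
        return "integer-only path";
    }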
diff --git a/src/crypto/sha/asm/sha1-armv8.pl b/src/crypto/sha/asm/sha1-armv8.pl
index deb1238..a8c08c2 100644
--- a/src/crypto/sha/asm/sha1-armv8.pl
+++ b/src/crypto/sha/asm/sha1-armv8.pl
@@ -14,13 +14,25 @@
#
#              hardware-assisted    software(*)
# Apple A7     2.31                 4.13 (+14%)
-# Cortex-A53  2.19                 8.73 (+108%)
+# Cortex-A53  2.24                 8.03 (+97%)
# Cortex-A57   2.35                 7.88 (+74%)
+# Denver      2.13                 3.97 (+0%)(**)
+# X-Gene                           8.80 (+200%)
#
# (*) Software results are presented mostly for reference purposes.
+# (**) Keep in mind that Denver relies on binary translation, which
+# optimizes compiler output at run-time.
$flavour = shift;
-open STDOUT,">".shift;
+$output = shift;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
($ctx,$inp,$num)=("x0","x1","x2");
@Xw=map("w$_",(3..17,19));
@@ -154,6 +166,7 @@ $code.=<<___;
.text
+.extern OPENSSL_armcap_P
.globl sha1_block_data_order
.type sha1_block_data_order,%function
.align 6
diff --git a/src/crypto/sha/asm/sha256-armv4.pl b/src/crypto/sha/asm/sha256-armv4.pl
index 398376e..778c3d9 100644
--- a/src/crypto/sha/asm/sha256-armv4.pl
+++ b/src/crypto/sha/asm/sha256-armv4.pl
@@ -5,6 +5,8 @@
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
+#
+# Permission to use under GPL terms is granted.
# ====================================================================
# SHA256 block procedure for ARMv4. May 2007.
@@ -35,8 +37,20 @@
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
$ctx="r0"; $t0="r0";
$inp="r1"; $t4="r1";
@@ -71,7 +85,9 @@ $code.=<<___ if ($i<16);
eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
+# ifndef __ARMEB__
rev $t1,$t1
+# endif
#else
@ ldrb $t1,[$inp,#3] @ $i
add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
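The new __ARMEB__ guard skips the rev byte swap on big-endian builds: SHA-256 consumes big-endian words, so only little-endian targets need the correction after loading. A worked illustration of what rev does to a loaded word:

    # rev reverses the four bytes of a 32-bit word, which is exactly
    # the little-endian-load -> big-endian-word correction
    my $loaded = unpack("V", "\x01\x02\x03\x04");    # LE load: 0x04030201
    my $rev    = unpack("N", pack("V", $loaded));    # rev:     0x01020304
    printf "%08x -> %08x\n", $loaded, $rev;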
@@ -151,10 +167,25 @@ ___
}
$code=<<___;
-#include "arm_arch.h"
+#ifndef __KERNEL__
+# include "arm_arch.h"
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+#endif
.text
+#if __ARM_ARCH__<7
.code 32
+#else
+.syntax unified
+# if defined(__thumb2__) && !defined(__APPLE__)
+# define adrl adr
+.thumb
+# else
+.code 32
+# endif
+#endif
.type K256,%object
.align 5
@@ -177,25 +208,33 @@ K256:
.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size K256,.-K256
.word 0 @ terminator
-#if __ARM_MAX_ARCH__>=7
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
-.word OPENSSL_armcap_P-sha256_block_data_order
+.word OPENSSL_armcap_P-.Lsha256_block_data_order
#endif
.align 5
.global sha256_block_data_order
.type sha256_block_data_order,%function
sha256_block_data_order:
+.Lsha256_block_data_order:
+#if __ARM_ARCH__<7
sub r3,pc,#8 @ sha256_block_data_order
- add $len,$inp,$len,lsl#6 @ len to point at the end of inp
-#if __ARM_MAX_ARCH__>=7
+#else
+ adr r3,sha256_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P
+#ifdef __APPLE__
+ ldr r12,[r12]
+#endif
tst r12,#ARMV8_SHA256
bne .LARMv8
tst r12,#ARMV7_NEON
bne .LNEON
#endif
+ add $len,$inp,$len,lsl#6 @ len to point at the end of inp
stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
sub $Ktbl,r3,#256+32 @ K256
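This hunk also reworks the position-independent lookup of OPENSSL_armcap_P: the .word holds the link-time difference between the variable and the new local label .Lsha256_block_data_order, and ldr r12,[r3,r12] adds that difference back to the label's run-time address in r3. Referencing the local label instead of the global function symbol plausibly avoids the Thumb bit that linkers fold into the value of a Thumb function symbol; treat that rationale as an inference. A worked sketch of the arithmetic with made-up addresses:

    # the assembler stores only the distance; the code rebuilds the
    # variable's address from the label's run-time address
    my $label  = 0x10400;              # .Lsha256_block_data_order, hypothetical
    my $armcap = 0x34f00;              # OPENSSL_armcap_P, hypothetical
    my $word   = $armcap - $label;     # what the .word ends up holding
    printf "0x%05x\n", $label + $word; # ldr r12,[r3,r12] yields 0x34f00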
@@ -213,6 +252,9 @@ for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
+#if __ARM_ARCH__>=7
+ ite eq @ Thumb2 thing, sanity check in ARM
+#endif
ldreq $t3,[sp,#16*4] @ pull ctx
bne .Lrounds_16_xx
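The ite eq inserted here is the Thumb-2 IT prefix: it declares the next two instructions conditional, the first (ldreq) executing on EQ and the second (bne) on NE. In ARM mode, unified-syntax assemblers accept the prefix purely as a consistency check on the conditional instructions that follow, which is what the "sanity check in ARM" comment alludes to. A toy model of the then/else pairing:

    # hedged toy model of "ite eq": one then-slot, one else-slot
    my $Z = 1;                                  # pretend the Z flag is set
    print "ldreq runs (then, EQ)\n" if  $Z;     # first slot: executes
    print "bne   runs (else, NE)\n" if !$Z;     # second slot: skipped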
@@ -429,16 +471,19 @@ $code.=<<___;
.arch armv7-a
.fpu neon
+.global sha256_block_data_order_neon
.type sha256_block_data_order_neon,%function
.align 4
sha256_block_data_order_neon:
.LNEON:
stmdb sp!,{r4-r12,lr}
+ sub $H,sp,#16*4+16
+ adr $Ktbl,K256
+ bic $H,$H,#15 @ align for 128-bit stores
mov $t2,sp
- sub sp,sp,#16*4+16 @ alloca
- sub $Ktbl,r3,#256+32 @ K256
- bic sp,sp,#15 @ align for 128-bit stores
+ mov sp,$H @ alloca
+ add $len,$inp,$len,lsl#6 @ len to point at the end of inp
vld1.8 {@X[0]},[$inp]!
vld1.8 {@X[1]},[$inp]!
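The reshuffled NEON prologue computes the aligned stack pointer in a scratch register ($H) before moving it into sp, rather than applying bic to sp directly; a plausible reason is that Thumb-2 forbids most data-processing writes to sp, though that motivation is an inference. The alignment arithmetic itself, worked through with a made-up stack pointer:

    # clearing the low four bits (bic #15) rounds down to the
    # 16-byte boundary the 128-bit vst1 stores want
    my $sp        = 0x7fff_1234;
    my $candidate = $sp - (16*4 + 16);     # sub $H,sp,#16*4+16
    my $aligned   = $candidate & ~15;      # bic $H,$H,#15
    printf "%08x -> %08x\n", $candidate, $aligned;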
@@ -490,11 +535,13 @@ $code.=<<___;
ldr $t0,[sp,#72]
sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl
teq $inp,$t0
+ it eq
subeq $inp,$inp,#64 @ avoid SEGV
vld1.8 {@X[0]},[$inp]! @ load next input block
vld1.8 {@X[1]},[$inp]!
vld1.8 {@X[2]},[$inp]!
vld1.8 {@X[3]},[$inp]!
+ it ne
strne $inp,[sp,#68]
mov $Xfer,sp
___
@@ -526,10 +573,12 @@ $code.=<<___;
str $D,[$t1],#4
stmia $t1,{$E-$H}
+ ittte ne
movne $Xfer,sp
ldrne $t1,[sp,#0]
eorne $t2,$t2,$t2
ldreq sp,[sp,#76] @ restore original sp
+ itt ne
eorne $t3,$B,$C
bne .L_00_48
@@ -548,13 +597,28 @@ my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";
$code.=<<___;
-#if __ARM_MAX_ARCH__>=7
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+
+# if defined(__thumb2__) && !defined(__APPLE__)
+# define INST(a,b,c,d) .byte c,d|0xc,a,b
+# else
+# define INST(a,b,c,d) .byte a,b,c,d
+# endif
+
.type sha256_block_data_order_armv8,%function
.align 5
sha256_block_data_order_armv8:
.LARMv8:
vld1.32 {$ABCD,$EFGH},[$ctx]
- sub $Ktbl,r3,#sha256_block_data_order-K256
+# ifdef __APPLE__
+ sub $Ktbl,$Ktbl,#256+32
+# elif defined(__thumb2__)
+ adr $Ktbl,.LARMv8
+ sub $Ktbl,$Ktbl,#.LARMv8-K256
+# else
+ adrl $Ktbl,K256
+# endif
+ add $len,$inp,$len,lsl#6 @ len to point at the end of inp
.Loop_v8:
vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
@@ -607,6 +671,7 @@ $code.=<<___;
vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
vadd.i32 $EFGH,$EFGH,$EFGH_SAVE
+ it ne
bne .Loop_v8
vst1.32 {$ABCD,$EFGH},[$ctx]
@@ -619,12 +684,20 @@ ___
$code.=<<___;
.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
-#if __ARM_MAX_ARCH__>=7
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm OPENSSL_armcap_P,4,4
.hidden OPENSSL_armcap_P
#endif
___
+open SELF,$0;
+while(<SELF>) {
+ next if (/^#!/);
+ last if (!s/^#/@/ and !/^$/);
+ print;
+}
+close SELF;
+
{ my %opcode = (
"sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40,
"sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 );
@@ -639,7 +712,7 @@ ___
# since ARMv7 instructions are always encoded little-endian.
# correct solution is to use .inst directive, but older
# assemblers don't implement it:-(
- sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
+ sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
$word&0xff,($word>>8)&0xff,
($word>>16)&0xff,($word>>24)&0xff,
$mnemonic,$arg;
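The INST() macro exists because these SHA-256 extension instructions are emitted as raw bytes from the %opcode table above, and ARM mode stores a 32-bit instruction as one little-endian word while Thumb-2 stores it as two little-endian halfwords; on top of that, the ARM-mode 0xf3 prefix byte becomes 0xff in the Thumb-2 encodings used here, which the d|0xc term supplies. A worked check for sha256h (ARM opcode 0xf3000c40, taken from the table):

    my $word = 0xf3000c40;
    my @b = map { ($word >> 8*$_) & 0xff } 0..3;   # (a,b,c,d) = LE bytes
    printf "ARM:     .byte 0x%02x,0x%02x,0x%02x,0x%02x\n", @b;   # 40,0c,00,f3
    printf "Thumb-2: .byte 0x%02x,0x%02x,0x%02x,0x%02x\n",       # 00,ff,40,0c
           $b[2], $b[3]|0x0c, $b[0], $b[1];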
diff --git a/src/crypto/sha/asm/sha512-armv4.pl b/src/crypto/sha/asm/sha512-armv4.pl
index bfe28c4..2964a39 100644
--- a/src/crypto/sha/asm/sha512-armv4.pl
+++ b/src/crypto/sha/asm/sha512-armv4.pl
@@ -5,6 +5,8 @@
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
+#
+# Permission to use under GPL terms is granted.
# ====================================================================
# SHA512 block procedure for ARMv4. September 2007.
@@ -34,16 +36,9 @@
# terms it's 22.6 cycles per byte, which is a disappointing result.
# Technical writers asserted that the 3-way S4 pipeline can sustain
# multiple NEON instructions per cycle, but dual NEON issue could
-# not be observed, and for NEON-only sequences IPC(*) was found to
-# be limited by 1:-( 0.33 and 0.66 were measured for sequences with
-# ILPs(*) of 1 and 2 respectively. This in turn means that you can
-# even find yourself striving, as I did here, for achieving IPC
-# adequate to one delivered by Cortex A8 [for reference, it's
-# 0.5 for ILP of 1, and 1 for higher ILPs].
-#
-# (*) ILP, instruction-level parallelism, how many instructions
-# *can* execute at the same time. IPC, instructions per cycle,
-# indicates how many instructions actually execute.
+# not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html
+# for further details. As a side note, Cortex-A15 processes one byte in
+# 16 cycles.
# Byte order [in]dependence. =========================================
#
@@ -55,8 +50,20 @@ $hi="HI";
$lo="LO";
# ====================================================================
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
$ctx="r0"; # parameter block
$inp="r1";
@@ -143,6 +150,9 @@ $code.=<<___;
teq $t0,#$magic
ldr $t3,[sp,#$Coff+0] @ c.lo
+#if __ARM_ARCH__>=7
+ it eq @ Thumb2 thing, sanity check in ARM
+#endif
orreq $Ktbl,$Ktbl,#1
@ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
@ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
@@ -180,7 +190,17 @@ $code.=<<___;
___
}
$code=<<___;
-#include "arm_arch.h"
+#ifndef __KERNEL__
+# include "arm_arch.h"
+# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
+# define VFP_ABI_POP vldmia sp!,{d8-d15}
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+# define VFP_ABI_PUSH
+# define VFP_ABI_POP
+#endif
+
#ifdef __ARMEL__
# define LO 0
# define HI 4
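The VFP_ABI_PUSH/POP pair encodes an AAPCS rule: d8-d15 are callee-saved, so the user-space NEON path must save and restore them around use, while __KERNEL__ builds define the macros away (presumably because kernel callers manage FP state themselves; that reason is an inference). A sketch of the two expansions the preprocessor chooses between:

    my $kernel = 0;   # pretend: user-space build
    my ($push, $pop) = $kernel
        ? ("", "")                                          # __KERNEL__: no-ops
        : ("vstmdb sp!,{d8-d15}", "vldmia sp!,{d8-d15}");   # AAPCS save/restore
    print "VFP_ABI_PUSH => $push\nVFP_ABI_POP  => $pop\n";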
@@ -192,7 +212,18 @@ $code=<<___;
#endif
.text
+#if __ARM_ARCH__<7 || defined(__APPLE__)
.code 32
+#else
+.syntax unified
+# ifdef __thumb2__
+# define adrl adr
+.thumb
+# else
+.code 32
+# endif
+#endif
+
.type K512,%object
.align 5
K512:
@@ -237,9 +268,9 @@ WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size K512,.-K512
-#if __ARM_MAX_ARCH__>=7
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
-.word OPENSSL_armcap_P-sha512_block_data_order
+.word OPENSSL_armcap_P-.Lsha512_block_data_order
.skip 32-4
#else
.skip 32
@@ -248,14 +279,22 @@ WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.global sha512_block_data_order
.type sha512_block_data_order,%function
sha512_block_data_order:
+.Lsha512_block_data_order:
+#if __ARM_ARCH__<7
sub r3,pc,#8 @ sha512_block_data_order
- add $len,$inp,$len,lsl#7 @ len to point at the end of inp
-#if __ARM_MAX_ARCH__>=7
+#else
+ adr r3,sha512_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P
+#ifdef __APPLE__
+ ldr r12,[r12]
+#endif
tst r12,#1
bne .LNEON
#endif
+ add $len,$inp,$len,lsl#7 @ len to point at the end of inp
stmdb sp!,{r4-r12,lr}
sub $Ktbl,r3,#672 @ K512
sub sp,sp,#9*8
@@ -369,6 +408,9 @@ $code.=<<___;
___
&BODY_00_15(0x17);
$code.=<<___;
+#if __ARM_ARCH__>=7
+ ittt eq @ Thumb2 thing, sanity check in ARM
+#endif
ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0]
ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4]
beq .L16_79
@@ -453,6 +495,7 @@ $code.=<<___;
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
+.size sha512_block_data_order,.-sha512_block_data_order
___
{
@@ -559,11 +602,15 @@ $code.=<<___;
.arch armv7-a
.fpu neon
+.global sha512_block_data_order_neon
+.type sha512_block_data_order_neon,%function
.align 4
+sha512_block_data_order_neon:
.LNEON:
dmb @ errata #451034 on early Cortex A8
- vstmdb sp!,{d8-d15} @ ABI specification says so
- sub $Ktbl,r3,#672 @ K512
+ add $len,$inp,$len,lsl#7 @ len to point at the end of inp
+ adr $Ktbl,K512
+ VFP_ABI_PUSH
vldmia $ctx,{$A-$H} @ load context
.Loop_neon:
___
@@ -588,16 +635,16 @@ $code.=<<___;
sub $Ktbl,#640 @ rewind K512
bne .Loop_neon
- vldmia sp!,{d8-d15} @ epilogue
+ VFP_ABI_POP
ret @ bx lr
+.size sha512_block_data_order_neon,.-sha512_block_data_order_neon
#endif
___
}
$code.=<<___;
-.size sha512_block_data_order,.-sha512_block_data_order
.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
-#if __ARM_MAX_ARCH__>=7
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm OPENSSL_armcap_P,4,4
.hidden OPENSSL_armcap_P
#endif
@@ -606,5 +653,14 @@ ___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx lr/gm;
+
+open SELF,$0;
+while(<SELF>) {
+ next if (/^#!/);
+ last if (!s/^#/@/ and !/^$/);
+ print;
+}
+close SELF;
+
print $code;
close STDOUT; # enforce flush
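The closing substitutions keep the output assemblable with -march=armv4, where gas rejects the bx mnemonic. Because the bx-lr rewrite runs first, literal "bx lr" sequences (in the ARMv4-compatible epilogue) become raw instruction words, while "ret" -- which in this file appears only in the NEON code after .arch armv7-a -- is then spelled back out as a genuine bx lr. A worked check that 0xe12fff1e really is the encoding of bx lr:

    # condition AL (0xe) in bits 31..28, the BX pattern 0x12fff1 in
    # bits 27..4, and Rm = lr = r14 in bits 3..0
    my $insn = (0xe << 28) | (0x12fff1 << 4) | 14;
    printf "0x%08x\n", $insn;    # prints 0xe12fff1e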
diff --git a/src/crypto/sha/asm/sha512-armv8.pl b/src/crypto/sha/asm/sha512-armv8.pl
index 5a9c812..43e7293 100644
--- a/src/crypto/sha/asm/sha512-armv8.pl
+++ b/src/crypto/sha/asm/sha512-armv8.pl
@@ -14,8 +14,10 @@
#
#              SHA256-hw    SHA256(*)       SHA512
# Apple A7     1.97         10.5 (+33%)     6.73 (-1%(**))
-# Cortex-A53  2.38         15.6 (+110%)    10.1 (+190%(***))
+# Cortex-A53  2.38         15.5 (+115%)    10.0 (+150%(***))
# Cortex-A57   2.31         11.6 (+86%)     7.51 (+260%(***))
+# Denver      2.01         10.5 (+26%)     6.70 (+8%)
+# X-Gene                   20.0 (+100%)    12.8 (+300%(***))
#
# (*) Software SHA256 results are of lesser relevance, presented
# mostly for informational purposes.
@@ -25,12 +27,24 @@
# (***) Super-impressive coefficients over gcc-generated code are
# indication of some compiler "pathology", most notably code
# generated with -mgeneral-regs-only is significantly faster
-# and lags behind assembly only by 50-90%.
+# and the gap is only 40-90%.
$flavour=shift;
+# Unlike most perlasm files, sha512-armv8.pl takes an additional argument to
+# determine which hash function to emit. This differs from upstream OpenSSL,
+# which infers the function from the output file name; passing it explicitly
+# lets the script keep writing to stdout.
+$variant=shift;
$output=shift;
-if ($output =~ /512/) {
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+if ($variant eq "sha512") {
$BITS=512;
$SZ=8;
@Sigma0=(28,34,39);
@@ -39,7 +53,7 @@ if ($output =~ /512/) {
@sigma1=(19,61, 6);
$rounds=80;
$reg_t="x";
-} else {
+} elsif ($variant eq "sha256") {
$BITS=256;
$SZ=4;
@Sigma0=( 2,13,22);
@@ -48,6 +62,8 @@ if ($output =~ /512/) {
@sigma1=(17,19,10);
$rounds=64;
$reg_t="w";
+} else {
+ die "Unknown variant: $variant";
}
$func="sha${BITS}_block_data_order";
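Since the variant is now an explicit argument, invocation differs from the two-argument scripts earlier in this patch. A hedged usage sketch, with hypothetical file names:

    # argument order is flavour, variant, output:
    #
    #   perl sha512-armv8.pl linux64 sha256 sha256-armv8.S
    #   perl sha512-armv8.pl ios64   sha512 sha512-armv8.S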
@@ -152,6 +168,7 @@ $code.=<<___;
.text
+.extern OPENSSL_armcap_P
.globl $func
.type $func,%function
.align 6
@@ -181,7 +198,7 @@ $code.=<<___;
ldp $E,$F,[$ctx,#4*$SZ]
add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input
ldp $G,$H,[$ctx,#6*$SZ]
- adr $Ktbl,K$BITS
+ adr $Ktbl,.LK$BITS
stp $ctx,$num,[x29,#96]
.Loop:
@@ -231,8 +248,8 @@ $code.=<<___;
.size $func,.-$func
.align 6
-.type K$BITS,%object
-K$BITS:
+.type .LK$BITS,%object
+.LK$BITS:
___
$code.=<<___ if ($SZ==8);
.quad 0x428a2f98d728ae22,0x7137449123ef65cd
@@ -297,7 +314,7 @@ $code.=<<___ if ($SZ==4);
.long 0 //terminator
___
$code.=<<___;
-.size K$BITS,.-K$BITS
+.size .LK$BITS,.-.LK$BITS
.align 3
.LOPENSSL_armcap_P:
.quad OPENSSL_armcap_P-.
@@ -322,7 +339,7 @@ sha256_block_armv8:
add x29,sp,#0
ld1.32 {$ABCD,$EFGH},[$ctx]
- adr $Ktbl,K256
+ adr $Ktbl,.LK256
.Loop_hw:
ld1 {@MSG[0]-@MSG[3]},[$inp],#64