1 files changed, 20 insertions, 3 deletions
diff --git a/src/crypto/sha/asm/sha1-armv4-large.pl b/src/crypto/sha/asm/sha1-armv4-large.pl
index 1ffa041..a20d336 100644
--- a/src/crypto/sha/asm/sha1-armv4-large.pl
+++ b/src/crypto/sha/asm/sha1-armv4-large.pl
@@ -60,14 +60,28 @@
 # is ~2.5x larger and there are some redundant instructions executed
 # when processing last block, improvement is not as big for smallest
 # blocks, only ~30%. Snapdragon S4 is a tad faster, 6.4 cycles per
-# byte, which is also >80% faster than integer-only code.
+# byte, which is also >80% faster than integer-only code. Cortex-A15
+# is even faster spending 5.6 cycles per byte outperforming integer-
+# only code by factor of 2.
 
 # May 2014.
 #
 # Add ARMv8 code path performing at 2.35 cpb on Apple A7.
 
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+
+    open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+    open STDOUT,">$output";
+}
 
 $ctx="r0";
 $inp="r1";
@@ -178,6 +192,9 @@ sha1_block_data_order:
 	sub	r3,pc,#8		@ sha1_block_data_order
 	ldr	r12,.LOPENSSL_armcap
 	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
+#ifdef	__APPLE__
+	ldr	r12,[r12]
+#endif
 	tst	r12,#ARMV8_SHA1
 	bne	.LARMv8
 	tst	r12,#ARMV7_NEON