Diffstat (limited to 'src/crypto/bn')
-rw-r--r--  src/crypto/bn/CMakeLists.txt         76
-rw-r--r--  src/crypto/bn/add.c                 394
-rw-r--r--  src/crypto/bn/asm/armv4-mont.pl     670
-rw-r--r--  src/crypto/bn/asm/bn-586.pl         774
-rw-r--r--  src/crypto/bn/asm/co-586.pl         287
-rw-r--r--  src/crypto/bn/asm/rsaz-avx2.pl     1898
-rw-r--r--  src/crypto/bn/asm/rsaz-x86_64.pl   2144
-rw-r--r--  src/crypto/bn/asm/x86-mont.pl       592
-rw-r--r--  src/crypto/bn/asm/x86_64-gcc.c      579
-rw-r--r--  src/crypto/bn/asm/x86_64-mont.pl   1401
-rw-r--r--  src/crypto/bn/asm/x86_64-mont5.pl  3499
-rw-r--r--  src/crypto/bn/bn.c                  338
-rw-r--r--  src/crypto/bn/bn_error.c             63
-rw-r--r--  src/crypto/bn/bn_test.c            1471
-rw-r--r--  src/crypto/bn/cmp.c                 200
-rw-r--r--  src/crypto/bn/convert.c             504
-rw-r--r--  src/crypto/bn/ctx.c                 315
-rw-r--r--  src/crypto/bn/div.c                 620
-rw-r--r--  src/crypto/bn/exponentiation.c     1535
-rw-r--r--  src/crypto/bn/gcd.c                 704
-rw-r--r--  src/crypto/bn/generic.c            1120
-rw-r--r--  src/crypto/bn/internal.h            297
-rw-r--r--  src/crypto/bn/kronecker.c           175
-rw-r--r--  src/crypto/bn/montgomery.c          571
-rw-r--r--  src/crypto/bn/mul.c                 886
-rw-r--r--  src/crypto/bn/prime.c               838
-rw-r--r--  src/crypto/bn/random.c              328
-rw-r--r--  src/crypto/bn/rsaz_exp.c            326
-rw-r--r--  src/crypto/bn/rsaz_exp.h             44
-rw-r--r--  src/crypto/bn/shift.c               287
-rw-r--r--  src/crypto/bn/sqrt.c                505
31 files changed, 23441 insertions, 0 deletions
diff --git a/src/crypto/bn/CMakeLists.txt b/src/crypto/bn/CMakeLists.txt
new file mode 100644
index 0000000..600be4d
--- /dev/null
+++ b/src/crypto/bn/CMakeLists.txt
@@ -0,0 +1,76 @@
+include_directories(. .. ../../include)
+
+if (${ARCH} STREQUAL "x86_64")
+ set(
+ BN_ARCH_SOURCES
+
+ asm/x86_64-gcc.c
+ x86_64-mont.${ASM_EXT}
+ x86_64-mont5.${ASM_EXT}
+ rsaz-x86_64.${ASM_EXT}
+ rsaz-avx2.${ASM_EXT}
+
+ rsaz_exp.c
+ )
+endif()
+
+if (${ARCH} STREQUAL "x86")
+ set(
+ BN_ARCH_SOURCES
+
+ bn-586.${ASM_EXT}
+ co-586.${ASM_EXT}
+ x86-mont.${ASM_EXT}
+ )
+endif()
+
+if (${ARCH} STREQUAL "arm")
+ set(
+ BN_ARCH_SOURCES
+
+ armv4-mont.${ASM_EXT}
+ )
+endif()
+
+add_library(
+ bn
+
+ OBJECT
+
+ bn_error.c
+ add.c
+ bn.c
+ cmp.c
+ convert.c
+ ctx.c
+ div.c
+ exponentiation.c
+ generic.c
+ gcd.c
+ kronecker.c
+ montgomery.c
+ mul.c
+ prime.c
+ random.c
+ shift.c
+ sqrt.c
+
+ ${BN_ARCH_SOURCES}
+)
+
+perlasm(x86_64-mont.${ASM_EXT} asm/x86_64-mont.pl)
+perlasm(x86_64-mont5.${ASM_EXT} asm/x86_64-mont5.pl)
+perlasm(rsaz-x86_64.${ASM_EXT} asm/rsaz-x86_64.pl)
+perlasm(rsaz-avx2.${ASM_EXT} asm/rsaz-avx2.pl)
+perlasm(bn-586.${ASM_EXT} asm/bn-586.pl)
+perlasm(co-586.${ASM_EXT} asm/co-586.pl)
+perlasm(x86-mont.${ASM_EXT} asm/x86-mont.pl)
+perlasm(armv4-mont.${ASM_EXT} asm/armv4-mont.pl)
+
+add_executable(
+ bn_test
+
+ bn_test.c
+)
+
+target_link_libraries(bn_test crypto)
diff --git a/src/crypto/bn/add.c b/src/crypto/bn/add.c
new file mode 100644
index 0000000..1c6b2d7
--- /dev/null
+++ b/src/crypto/bn/add.c
@@ -0,0 +1,394 @@
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are adhered to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code. The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * "This product includes cryptographic software written by
+ * Eric Young (eay@cryptsoft.com)"
+ * The word 'cryptographic' can be left out if the routines from the library
+ * being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ * the apps directory (application code) you must include an acknowledgement:
+ * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed. i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.] */
+
+#include <openssl/bn.h>
+
+#include <openssl/err.h>
+#include <openssl/mem.h>
+
+#include "internal.h"
+
+
+int BN_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b) {
+ const BIGNUM *tmp;
+ int a_neg = a->neg, ret;
+
+ /* a + b a+b
+ * a + -b a-b
+ * -a + b b-a
+ * -a + -b -(a+b)
+ */
+ if (a_neg ^ b->neg) {
+ /* only one is negative */
+ if (a_neg) {
+ tmp = a;
+ a = b;
+ b = tmp;
+ }
+
+ /* we are now a - b */
+ if (BN_ucmp(a, b) < 0) {
+ if (!BN_usub(r, b, a)) {
+ return 0;
+ }
+ r->neg = 1;
+ } else {
+ if (!BN_usub(r, a, b)) {
+ return 0;
+ }
+ r->neg = 0;
+ }
+ return 1;
+ }
+
+ ret = BN_uadd(r, a, b);
+ r->neg = a_neg;
+ return ret;
+}
+
+int BN_uadd(BIGNUM *r, const BIGNUM *a, const BIGNUM *b) {
+ int max, min, dif;
+ BN_ULONG *ap, *bp, *rp, carry, t1, t2;
+ const BIGNUM *tmp;
+
+ if (a->top < b->top) {
+ tmp = a;
+ a = b;
+ b = tmp;
+ }
+ max = a->top;
+ min = b->top;
+ dif = max - min;
+
+ if (bn_wexpand(r, max + 1) == NULL) {
+ return 0;
+ }
+
+ r->top = max;
+
+ ap = a->d;
+ bp = b->d;
+ rp = r->d;
+
+ carry = bn_add_words(rp, ap, bp, min);
+ rp += min;
+ ap += min;
+ bp += min;
+
+ if (carry) {
+ while (dif) {
+ dif--;
+ t1 = *(ap++);
+ t2 = (t1 + 1) & BN_MASK2;
+ *(rp++) = t2;
+ if (t2) {
+ carry = 0;
+ break;
+ }
+ }
+ if (carry) {
+ /* carry != 0 => dif == 0 */
+ *rp = 1;
+ r->top++;
+ }
+ }
+
+ if (dif && rp != ap) {
+ while (dif--) {
+ /* copy remaining words if ap != rp */
+ *(rp++) = *(ap++);
+ }
+ }
+
+ r->neg = 0;
+ return 1;
+}
+
+int BN_add_word(BIGNUM *a, BN_ULONG w) {
+ BN_ULONG l;
+ int i;
+
+ w &= BN_MASK2;
+
+ /* degenerate case: w is zero */
+ if (!w) {
+ return 1;
+ }
+
+ /* degenerate case: a is zero */
+ if (BN_is_zero(a)) {
+ return BN_set_word(a, w);
+ }
+
+ /* handle 'a' when negative */
+ if (a->neg) {
+ a->neg = 0;
+ i = BN_sub_word(a, w);
+ if (!BN_is_zero(a)) {
+ a->neg = !(a->neg);
+ }
+ return i;
+ }
+
+ for (i = 0; w != 0 && i < a->top; i++) {
+ a->d[i] = l = (a->d[i] + w) & BN_MASK2;
+ w = (w > l) ? 1 : 0;
+ }
+
+ if (w && i == a->top) {
+ if (bn_wexpand(a, a->top + 1) == NULL) {
+ return 0;
+ }
+ a->top++;
+ a->d[i] = w;
+ }
+
+ return 1;
+}
+
+int BN_sub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b) {
+ int max;
+ int add = 0, neg = 0;
+ const BIGNUM *tmp;
+
+ /* a - b a-b
+ * a - -b a+b
+ * -a - b -(a+b)
+ * -a - -b b-a
+ */
+ if (a->neg) {
+ if (b->neg) {
+ tmp = a;
+ a = b;
+ b = tmp;
+ } else {
+ add = 1;
+ neg = 1;
+ }
+ } else {
+ if (b->neg) {
+ add = 1;
+ neg = 0;
+ }
+ }
+
+ if (add) {
+ if (!BN_uadd(r, a, b)) {
+ return 0;
+ }
+
+ r->neg = neg;
+ return 1;
+ }
+
+ /* We are actually doing a - b :-) */
+
+ max = (a->top > b->top) ? a->top : b->top;
+ if (bn_wexpand(r, max) == NULL) {
+ return 0;
+ }
+
+ if (BN_ucmp(a, b) < 0) {
+ if (!BN_usub(r, b, a)) {
+ return 0;
+ }
+ r->neg = 1;
+ } else {
+ if (!BN_usub(r, a, b)) {
+ return 0;
+ }
+ r->neg = 0;
+ }
+
+ return 1;
+}
+
+int BN_usub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b) {
+ int max, min, dif;
+ register BN_ULONG t1, t2, *ap, *bp, *rp;
+ int i, carry;
+
+ max = a->top;
+ min = b->top;
+ dif = max - min;
+
+  if (dif < 0) {
+    /* a must have at least as many words as b; a < b is a caller error. */
+    OPENSSL_PUT_ERROR(BN, BN_usub, BN_R_ARG2_LT_ARG3);
+    return 0;
+  }
+
+ if (bn_wexpand(r, max) == NULL) {
+ return 0;
+ }
+
+ ap = a->d;
+ bp = b->d;
+ rp = r->d;
+
+ carry = 0;
+ for (i = min; i != 0; i--) {
+ t1 = *(ap++);
+ t2 = *(bp++);
+ if (carry) {
+ carry = (t1 <= t2);
+ t1 = (t1 - t2 - 1) & BN_MASK2;
+ } else {
+ carry = (t1 < t2);
+ t1 = (t1 - t2) & BN_MASK2;
+ }
+ *(rp++) = t1 & BN_MASK2;
+ }
+
+  if (carry) {
+    /* A borrow is left over from the low words; propagate it upward. */
+ if (!dif) {
+ /* error: a < b */
+ return 0;
+ }
+
+ while (dif) {
+ dif--;
+ t1 = *(ap++);
+ t2 = (t1 - 1) & BN_MASK2;
+ *(rp++) = t2;
+ if (t1) {
+ break;
+ }
+ }
+ }
+
+ if (rp != ap) {
+ for (;;) {
+ if (!dif--) {
+ break;
+ }
+ rp[0] = ap[0];
+ if (!dif--) {
+ break;
+ }
+ rp[1] = ap[1];
+ if (!dif--) {
+ break;
+ }
+ rp[2] = ap[2];
+ if (!dif--) {
+ break;
+ }
+ rp[3] = ap[3];
+ rp += 4;
+ ap += 4;
+ }
+ }
+
+ r->top = max;
+ r->neg = 0;
+ bn_correct_top(r);
+
+ return 1;
+}
+
+int BN_sub_word(BIGNUM *a, BN_ULONG w) {
+ int i;
+
+ w &= BN_MASK2;
+
+ /* degenerate case: w is zero */
+ if (!w) {
+ return 1;
+ }
+
+ /* degenerate case: a is zero */
+ if (BN_is_zero(a)) {
+ i = BN_set_word(a, w);
+ if (i != 0) {
+ BN_set_negative(a, 1);
+ }
+ return i;
+ }
+
+ /* handle 'a' when negative */
+ if (a->neg) {
+ a->neg = 0;
+ i = BN_add_word(a, w);
+ a->neg = 1;
+ return i;
+ }
+
+ if ((a->top == 1) && (a->d[0] < w)) {
+ a->d[0] = w - a->d[0];
+ a->neg = 1;
+ return 1;
+ }
+
+ i = 0;
+ for (;;) {
+ if (a->d[i] >= w) {
+ a->d[i] -= w;
+ break;
+ } else {
+ a->d[i] = (a->d[i] - w) & BN_MASK2;
+ i++;
+ w = 1;
+ }
+ }
+
+ if ((a->d[i] == 0) && (i == (a->top - 1))) {
+ a->top--;
+ }
+
+ return 1;
+}
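
For reference, the sign tables in BN_add and BN_sub above reduce every signed
case to an unsigned BN_uadd or BN_usub plus a sign fix-up. A minimal
caller-side sketch exercising two of those cases (it assumes the usual
<openssl/bn.h> API from this tree; error checking is elided for brevity):

    #include <stdio.h>
    #include <openssl/bn.h>
    #include <openssl/mem.h>

    int main(void) {
      BIGNUM *a = BN_new(), *b = BN_new(), *r = BN_new();
      BN_set_word(a, 100);
      BN_set_word(b, 300);
      BN_set_negative(b, 1);           /* b = -300 */

      BN_add(r, a, b);                 /* signs differ: -(BN_usub(300, 100)) */
      char *s = BN_bn2dec(r);
      printf("100 + -300 = %s\n", s);  /* prints -200 */
      OPENSSL_free(s);

      BN_sub(r, b, a);                 /* signs differ: -(BN_uadd(300, 100)) */
      s = BN_bn2dec(r);
      printf("-300 - 100 = %s\n", s);  /* prints -400 */
      OPENSSL_free(s);

      BN_free(a);
      BN_free(b);
      BN_free(r);
      return 0;
    }
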
diff --git a/src/crypto/bn/asm/armv4-mont.pl b/src/crypto/bn/asm/armv4-mont.pl
new file mode 100644
index 0000000..5cc1328
--- /dev/null
+++ b/src/crypto/bn/asm/armv4-mont.pl
@@ -0,0 +1,670 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# January 2007.
+
+# Montgomery multiplication for ARMv4.
+#
+# Performance improvement naturally varies among CPU implementations
+# and compilers. The code was observed to provide +65-35% improvement
+# [depending on key length, less for longer keys] on ARM920T, and
+# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
+# base and compiler generated code with in-lined umull and even umlal
+# instructions. The latter means that this code didn't really have an
+# "advantage" of utilizing some "secret" instruction.
+#
+# The code is interoperable with Thumb ISA and is rather compact, less
+# than 1/2KB. Windows CE port would be trivial, as it's exclusively
+# about decorations, ABI and instruction syntax are identical.
+
+# November 2013
+#
+# Add NEON code path, which handles lengths divisible by 8. RSA/DSA
+# performance improvement on Cortex-A8 is ~45-100% depending on key
+# length, more for longer keys. On Cortex-A15 the span is ~10-105%.
+# On Snapdragon S4 improvement was measured to vary from ~70% to
+# incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this is
+# rather because original integer-only code seems to perform
+# suboptimally on S4. Situation on Cortex-A9 is unfortunately
+# different. It's being looked into, but the trouble is that
+# performance for vectors longer than 256 bits is actually a couple
+# of percent worse than for integer-only code. The code is chosen
+# for execution on all NEON-capable processors, because gain on
+# others outweighs the marginal loss on Cortex-A9.
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$num="r0"; # starts as num argument, but holds &tp[num-1]
+$ap="r1";
+$bp="r2"; $bi="r2"; $rp="r2";
+$np="r3";
+$tp="r4";
+$aj="r5";
+$nj="r6";
+$tj="r7";
+$n0="r8";
+########### # r9 is reserved by ELF as platform specific, e.g. TLS pointer
+$alo="r10"; # sl, gcc uses it to keep @GOT
+$ahi="r11"; # fp
+$nlo="r12"; # ip
+########### # r13 is stack pointer
+$nhi="r14"; # lr
+########### # r15 is program counter
+
+#### argument block layout relative to &tp[num-1], a.k.a. $num
+$_rp="$num,#12*4";
+# ap permanently resides in r1
+$_bp="$num,#13*4";
+# np permanently resides in r3
+$_n0="$num,#14*4";
+$_num="$num,#15*4"; $_bpend=$_num;
+
+$code=<<___;
+#include "arm_arch.h"
+
+.text
+.code 32
+
+#if __ARM_ARCH__>=7
+.align 5
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-bn_mul_mont
+#endif
+
+.global bn_mul_mont
+.hidden bn_mul_mont
+.type bn_mul_mont,%function
+
+.align 5
+bn_mul_mont:
+ ldr ip,[sp,#4] @ load num
+ stmdb sp!,{r0,r2} @ sp points at argument block
+#if __ARM_ARCH__>=7
+ tst ip,#7
+ bne .Lialu
+ adr r0,bn_mul_mont
+ ldr r2,.LOPENSSL_armcap
+ ldr r0,[r0,r2]
+ tst r0,#1 @ NEON available?
+ ldmia sp, {r0,r2}
+ beq .Lialu
+ add sp,sp,#8
+ b bn_mul8x_mont_neon
+.align 4
+.Lialu:
+#endif
+ cmp ip,#2
+ mov $num,ip @ load num
+ movlt r0,#0
+ addlt sp,sp,#2*4
+ blt .Labrt
+
+ stmdb sp!,{r4-r12,lr} @ save 10 registers
+
+ mov $num,$num,lsl#2 @ rescale $num for byte count
+ sub sp,sp,$num @ alloca(4*num)
+ sub sp,sp,#4 @ +extra dword
+ sub $num,$num,#4 @ "num=num-1"
+ add $tp,$bp,$num @ &bp[num-1]
+
+ add $num,sp,$num @ $num to point at &tp[num-1]
+ ldr $n0,[$_n0] @ &n0
+ ldr $bi,[$bp] @ bp[0]
+ ldr $aj,[$ap],#4 @ ap[0],ap++
+ ldr $nj,[$np],#4 @ np[0],np++
+ ldr $n0,[$n0] @ *n0
+ str $tp,[$_bpend] @ save &bp[num]
+
+ umull $alo,$ahi,$aj,$bi @ ap[0]*bp[0]
+ str $n0,[$_n0] @ save n0 value
+ mul $n0,$alo,$n0 @ "tp[0]"*n0
+ mov $nlo,#0
+ umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"t[0]"
+ mov $tp,sp
+
+.L1st:
+ ldr $aj,[$ap],#4 @ ap[j],ap++
+ mov $alo,$ahi
+ ldr $nj,[$np],#4 @ np[j],np++
+ mov $ahi,#0
+ umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0]
+ mov $nhi,#0
+ umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
+ adds $nlo,$nlo,$alo
+ str $nlo,[$tp],#4 @ tp[j-1]=,tp++
+ adc $nlo,$nhi,#0
+ cmp $tp,$num
+ bne .L1st
+
+ adds $nlo,$nlo,$ahi
+ ldr $tp,[$_bp] @ restore bp
+ mov $nhi,#0
+ ldr $n0,[$_n0] @ restore n0
+ adc $nhi,$nhi,#0
+ str $nlo,[$num] @ tp[num-1]=
+ str $nhi,[$num,#4] @ tp[num]=
+
+.Louter:
+ sub $tj,$num,sp @ "original" $num-1 value
+ sub $ap,$ap,$tj @ "rewind" ap to &ap[1]
+ ldr $bi,[$tp,#4]! @ *(++bp)
+ sub $np,$np,$tj @ "rewind" np to &np[1]
+ ldr $aj,[$ap,#-4] @ ap[0]
+ ldr $alo,[sp] @ tp[0]
+ ldr $nj,[$np,#-4] @ np[0]
+ ldr $tj,[sp,#4] @ tp[1]
+
+ mov $ahi,#0
+ umlal $alo,$ahi,$aj,$bi @ ap[0]*bp[i]+tp[0]
+ str $tp,[$_bp] @ save bp
+ mul $n0,$alo,$n0
+ mov $nlo,#0
+ umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"tp[0]"
+ mov $tp,sp
+
+.Linner:
+ ldr $aj,[$ap],#4 @ ap[j],ap++
+ adds $alo,$ahi,$tj @ +=tp[j]
+ ldr $nj,[$np],#4 @ np[j],np++
+ mov $ahi,#0
+ umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i]
+ mov $nhi,#0
+ umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
+ adc $ahi,$ahi,#0
+ ldr $tj,[$tp,#8] @ tp[j+1]
+ adds $nlo,$nlo,$alo
+ str $nlo,[$tp],#4 @ tp[j-1]=,tp++
+ adc $nlo,$nhi,#0
+ cmp $tp,$num
+ bne .Linner
+
+ adds $nlo,$nlo,$ahi
+ mov $nhi,#0
+ ldr $tp,[$_bp] @ restore bp
+ adc $nhi,$nhi,#0
+ ldr $n0,[$_n0] @ restore n0
+ adds $nlo,$nlo,$tj
+ ldr $tj,[$_bpend] @ restore &bp[num]
+ adc $nhi,$nhi,#0
+ str $nlo,[$num] @ tp[num-1]=
+ str $nhi,[$num,#4] @ tp[num]=
+
+ cmp $tp,$tj
+ bne .Louter
+
+ ldr $rp,[$_rp] @ pull rp
+ add $num,$num,#4 @ $num to point at &tp[num]
+ sub $aj,$num,sp @ "original" num value
+ mov $tp,sp @ "rewind" $tp
+ mov $ap,$tp @ "borrow" $ap
+ sub $np,$np,$aj @ "rewind" $np to &np[0]
+
+ subs $tj,$tj,$tj @ "clear" carry flag
+.Lsub: ldr $tj,[$tp],#4
+ ldr $nj,[$np],#4
+ sbcs $tj,$tj,$nj @ tp[j]-np[j]
+ str $tj,[$rp],#4 @ rp[j]=
+ teq $tp,$num @ preserve carry
+ bne .Lsub
+ sbcs $nhi,$nhi,#0 @ upmost carry
+ mov $tp,sp @ "rewind" $tp
+ sub $rp,$rp,$aj @ "rewind" $rp
+
+ and $ap,$tp,$nhi
+ bic $np,$rp,$nhi
+ orr $ap,$ap,$np @ ap=borrow?tp:rp
+
+.Lcopy: ldr $tj,[$ap],#4 @ copy or in-place refresh
+ str sp,[$tp],#4 @ zap tp
+ str $tj,[$rp],#4
+ cmp $tp,$num
+ bne .Lcopy
+
+ add sp,$num,#4 @ skip over tp[num+1]
+ ldmia sp!,{r4-r12,lr} @ restore registers
+ add sp,sp,#2*4 @ skip over {r0,r2}
+ mov r0,#1
+.Labrt: tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+.size bn_mul_mont,.-bn_mul_mont
+___
+{
+sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
+sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
+
+my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));
+my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));
+my ($Z,$Temp)=("q4","q5");
+my ($A0xB,$A1xB,$A2xB,$A3xB,$A4xB,$A5xB,$A6xB,$A7xB)=map("q$_",(6..13));
+my ($Bi,$Ni,$M0)=map("d$_",(28..31));
+my $zero=&Dlo($Z);
+my $temp=&Dlo($Temp);
+
+my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
+my ($tinptr,$toutptr,$inner,$outer)=map("r$_",(6..9));
+
+$code.=<<___;
+#if __ARM_ARCH__>=7
+.fpu neon
+
+.type bn_mul8x_mont_neon,%function
+.align 5
+bn_mul8x_mont_neon:
+ mov ip,sp
+ stmdb sp!,{r4-r11}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+ ldmia ip,{r4-r5} @ load rest of parameter block
+
+ sub $toutptr,sp,#16
+ vld1.32 {${Bi}[0]}, [$bptr,:32]!
+ sub $toutptr,$toutptr,$num,lsl#4
+ vld1.32 {$A0-$A3}, [$aptr]! @ can't specify :32 :-(
+ and $toutptr,$toutptr,#-64
+ vld1.32 {${M0}[0]}, [$n0,:32]
+ mov sp,$toutptr @ alloca
+ veor $zero,$zero,$zero
+ subs $inner,$num,#8
+ vzip.16 $Bi,$zero
+
+ vmull.u32 $A0xB,$Bi,${A0}[0]
+ vmull.u32 $A1xB,$Bi,${A0}[1]
+ vmull.u32 $A2xB,$Bi,${A1}[0]
+ vshl.i64 $temp,`&Dhi("$A0xB")`,#16
+ vmull.u32 $A3xB,$Bi,${A1}[1]
+
+ vadd.u64 $temp,$temp,`&Dlo("$A0xB")`
+ veor $zero,$zero,$zero
+ vmul.u32 $Ni,$temp,$M0
+
+ vmull.u32 $A4xB,$Bi,${A2}[0]
+ vld1.32 {$N0-$N3}, [$nptr]!
+ vmull.u32 $A5xB,$Bi,${A2}[1]
+ vmull.u32 $A6xB,$Bi,${A3}[0]
+ vzip.16 $Ni,$zero
+ vmull.u32 $A7xB,$Bi,${A3}[1]
+
+ bne .LNEON_1st
+
+ @ special case for num=8, everything is in register bank...
+
+ vmlal.u32 $A0xB,$Ni,${N0}[0]
+ sub $outer,$num,#1
+ vmlal.u32 $A1xB,$Ni,${N0}[1]
+ vmlal.u32 $A2xB,$Ni,${N1}[0]
+ vmlal.u32 $A3xB,$Ni,${N1}[1]
+
+ vmlal.u32 $A4xB,$Ni,${N2}[0]
+ vmov $Temp,$A0xB
+ vmlal.u32 $A5xB,$Ni,${N2}[1]
+ vmov $A0xB,$A1xB
+ vmlal.u32 $A6xB,$Ni,${N3}[0]
+ vmov $A1xB,$A2xB
+ vmlal.u32 $A7xB,$Ni,${N3}[1]
+ vmov $A2xB,$A3xB
+ vmov $A3xB,$A4xB
+ vshr.u64 $temp,$temp,#16
+ vmov $A4xB,$A5xB
+ vmov $A5xB,$A6xB
+ vadd.u64 $temp,$temp,`&Dhi("$Temp")`
+ vmov $A6xB,$A7xB
+ veor $A7xB,$A7xB
+ vshr.u64 $temp,$temp,#16
+
+ b .LNEON_outer8
+
+.align 4
+.LNEON_outer8:
+ vld1.32 {${Bi}[0]}, [$bptr,:32]!
+ veor $zero,$zero,$zero
+ vzip.16 $Bi,$zero
+ vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
+
+ vmlal.u32 $A0xB,$Bi,${A0}[0]
+ vmlal.u32 $A1xB,$Bi,${A0}[1]
+ vmlal.u32 $A2xB,$Bi,${A1}[0]
+ vshl.i64 $temp,`&Dhi("$A0xB")`,#16
+ vmlal.u32 $A3xB,$Bi,${A1}[1]
+
+ vadd.u64 $temp,$temp,`&Dlo("$A0xB")`
+ veor $zero,$zero,$zero
+ subs $outer,$outer,#1
+ vmul.u32 $Ni,$temp,$M0
+
+ vmlal.u32 $A4xB,$Bi,${A2}[0]
+ vmlal.u32 $A5xB,$Bi,${A2}[1]
+ vmlal.u32 $A6xB,$Bi,${A3}[0]
+ vzip.16 $Ni,$zero
+ vmlal.u32 $A7xB,$Bi,${A3}[1]
+
+ vmlal.u32 $A0xB,$Ni,${N0}[0]
+ vmlal.u32 $A1xB,$Ni,${N0}[1]
+ vmlal.u32 $A2xB,$Ni,${N1}[0]
+ vmlal.u32 $A3xB,$Ni,${N1}[1]
+
+ vmlal.u32 $A4xB,$Ni,${N2}[0]
+ vmov $Temp,$A0xB
+ vmlal.u32 $A5xB,$Ni,${N2}[1]
+ vmov $A0xB,$A1xB
+ vmlal.u32 $A6xB,$Ni,${N3}[0]
+ vmov $A1xB,$A2xB
+ vmlal.u32 $A7xB,$Ni,${N3}[1]
+ vmov $A2xB,$A3xB
+ vmov $A3xB,$A4xB
+ vshr.u64 $temp,$temp,#16
+ vmov $A4xB,$A5xB
+ vmov $A5xB,$A6xB
+ vadd.u64 $temp,$temp,`&Dhi("$Temp")`
+ vmov $A6xB,$A7xB
+ veor $A7xB,$A7xB
+ vshr.u64 $temp,$temp,#16
+
+ bne .LNEON_outer8
+
+ vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
+ mov $toutptr,sp
+ vshr.u64 $temp,`&Dlo("$A0xB")`,#16
+ mov $inner,$num
+ vadd.u64 `&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
+ add $tinptr,sp,#16
+ vshr.u64 $temp,`&Dhi("$A0xB")`,#16
+ vzip.16 `&Dlo("$A0xB")`,`&Dhi("$A0xB")`
+
+ b .LNEON_tail2
+
+.align 4
+.LNEON_1st:
+ vmlal.u32 $A0xB,$Ni,${N0}[0]
+ vld1.32 {$A0-$A3}, [$aptr]!
+ vmlal.u32 $A1xB,$Ni,${N0}[1]
+ subs $inner,$inner,#8
+ vmlal.u32 $A2xB,$Ni,${N1}[0]
+ vmlal.u32 $A3xB,$Ni,${N1}[1]
+
+ vmlal.u32 $A4xB,$Ni,${N2}[0]
+ vld1.32 {$N0-$N1}, [$nptr]!
+ vmlal.u32 $A5xB,$Ni,${N2}[1]
+ vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]!
+ vmlal.u32 $A6xB,$Ni,${N3}[0]
+ vmlal.u32 $A7xB,$Ni,${N3}[1]
+ vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]!
+
+ vmull.u32 $A0xB,$Bi,${A0}[0]
+ vld1.32 {$N2-$N3}, [$nptr]!
+ vmull.u32 $A1xB,$Bi,${A0}[1]
+ vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]!
+ vmull.u32 $A2xB,$Bi,${A1}[0]
+ vmull.u32 $A3xB,$Bi,${A1}[1]
+ vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]!
+
+ vmull.u32 $A4xB,$Bi,${A2}[0]
+ vmull.u32 $A5xB,$Bi,${A2}[1]
+ vmull.u32 $A6xB,$Bi,${A3}[0]
+ vmull.u32 $A7xB,$Bi,${A3}[1]
+
+ bne .LNEON_1st
+
+ vmlal.u32 $A0xB,$Ni,${N0}[0]
+ add $tinptr,sp,#16
+ vmlal.u32 $A1xB,$Ni,${N0}[1]
+ sub $aptr,$aptr,$num,lsl#2 @ rewind $aptr
+ vmlal.u32 $A2xB,$Ni,${N1}[0]
+ vld1.64 {$Temp}, [sp,:128]
+ vmlal.u32 $A3xB,$Ni,${N1}[1]
+ sub $outer,$num,#1
+
+ vmlal.u32 $A4xB,$Ni,${N2}[0]
+ vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]!
+ vmlal.u32 $A5xB,$Ni,${N2}[1]
+ vshr.u64 $temp,$temp,#16
+ vld1.64 {$A0xB}, [$tinptr, :128]!
+ vmlal.u32 $A6xB,$Ni,${N3}[0]
+ vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]!
+ vmlal.u32 $A7xB,$Ni,${N3}[1]
+
+ vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]!
+ vadd.u64 $temp,$temp,`&Dhi("$Temp")`
+ veor $Z,$Z,$Z
+ vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]!
+ vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]!
+ vst1.64 {$Z}, [$toutptr,:128]
+ vshr.u64 $temp,$temp,#16
+
+ b .LNEON_outer
+
+.align 4
+.LNEON_outer:
+ vld1.32 {${Bi}[0]}, [$bptr,:32]!
+ sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr
+ vld1.32 {$A0-$A3}, [$aptr]!
+ veor $zero,$zero,$zero
+ mov $toutptr,sp
+ vzip.16 $Bi,$zero
+ sub $inner,$num,#8
+ vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
+
+ vmlal.u32 $A0xB,$Bi,${A0}[0]
+ vld1.64 {$A3xB-$A4xB},[$tinptr,:256]!
+ vmlal.u32 $A1xB,$Bi,${A0}[1]
+ vmlal.u32 $A2xB,$Bi,${A1}[0]
+ vld1.64 {$A5xB-$A6xB},[$tinptr,:256]!
+ vmlal.u32 $A3xB,$Bi,${A1}[1]
+
+ vshl.i64 $temp,`&Dhi("$A0xB")`,#16
+ veor $zero,$zero,$zero
+ vadd.u64 $temp,$temp,`&Dlo("$A0xB")`
+ vld1.64 {$A7xB},[$tinptr,:128]!
+ vmul.u32 $Ni,$temp,$M0
+
+ vmlal.u32 $A4xB,$Bi,${A2}[0]
+ vld1.32 {$N0-$N3}, [$nptr]!
+ vmlal.u32 $A5xB,$Bi,${A2}[1]
+ vmlal.u32 $A6xB,$Bi,${A3}[0]
+ vzip.16 $Ni,$zero
+ vmlal.u32 $A7xB,$Bi,${A3}[1]
+
+.LNEON_inner:
+ vmlal.u32 $A0xB,$Ni,${N0}[0]
+ vld1.32 {$A0-$A3}, [$aptr]!
+ vmlal.u32 $A1xB,$Ni,${N0}[1]
+ subs $inner,$inner,#8
+ vmlal.u32 $A2xB,$Ni,${N1}[0]
+ vmlal.u32 $A3xB,$Ni,${N1}[1]
+ vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]!
+
+ vmlal.u32 $A4xB,$Ni,${N2}[0]
+ vld1.64 {$A0xB}, [$tinptr, :128]!
+ vmlal.u32 $A5xB,$Ni,${N2}[1]
+ vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]!
+ vmlal.u32 $A6xB,$Ni,${N3}[0]
+ vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]!
+ vmlal.u32 $A7xB,$Ni,${N3}[1]
+ vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]!
+
+ vmlal.u32 $A0xB,$Bi,${A0}[0]
+ vld1.64 {$A3xB-$A4xB}, [$tinptr, :256]!
+ vmlal.u32 $A1xB,$Bi,${A0}[1]
+ vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]!
+ vmlal.u32 $A2xB,$Bi,${A1}[0]
+ vld1.64 {$A5xB-$A6xB}, [$tinptr, :256]!
+ vmlal.u32 $A3xB,$Bi,${A1}[1]
+ vld1.32 {$N0-$N3}, [$nptr]!
+
+ vmlal.u32 $A4xB,$Bi,${A2}[0]
+ vld1.64 {$A7xB}, [$tinptr, :128]!
+ vmlal.u32 $A5xB,$Bi,${A2}[1]
+ vmlal.u32 $A6xB,$Bi,${A3}[0]
+ vmlal.u32 $A7xB,$Bi,${A3}[1]
+
+ bne .LNEON_inner
+
+ vmlal.u32 $A0xB,$Ni,${N0}[0]
+ add $tinptr,sp,#16
+ vmlal.u32 $A1xB,$Ni,${N0}[1]
+ sub $aptr,$aptr,$num,lsl#2 @ rewind $aptr
+ vmlal.u32 $A2xB,$Ni,${N1}[0]
+ vld1.64 {$Temp}, [sp,:128]
+ vmlal.u32 $A3xB,$Ni,${N1}[1]
+ subs $outer,$outer,#1
+
+ vmlal.u32 $A4xB,$Ni,${N2}[0]
+ vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]!
+ vmlal.u32 $A5xB,$Ni,${N2}[1]
+ vld1.64 {$A0xB}, [$tinptr, :128]!
+ vshr.u64 $temp,$temp,#16
+ vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]!
+ vmlal.u32 $A6xB,$Ni,${N3}[0]
+ vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]!
+ vmlal.u32 $A7xB,$Ni,${N3}[1]
+
+ vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]!
+ vadd.u64 $temp,$temp,`&Dhi("$Temp")`
+ vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]!
+ vshr.u64 $temp,$temp,#16
+
+ bne .LNEON_outer
+
+ mov $toutptr,sp
+ mov $inner,$num
+
+.LNEON_tail:
+ vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
+ vld1.64 {$A3xB-$A4xB}, [$tinptr, :256]!
+ vshr.u64 $temp,`&Dlo("$A0xB")`,#16
+ vadd.u64 `&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
+ vld1.64 {$A5xB-$A6xB}, [$tinptr, :256]!
+ vshr.u64 $temp,`&Dhi("$A0xB")`,#16
+ vld1.64 {$A7xB}, [$tinptr, :128]!
+ vzip.16 `&Dlo("$A0xB")`,`&Dhi("$A0xB")`
+
+.LNEON_tail2:
+ vadd.u64 `&Dlo("$A1xB")`,`&Dlo("$A1xB")`,$temp
+ vst1.32 {`&Dlo("$A0xB")`[0]}, [$toutptr, :32]!
+ vshr.u64 $temp,`&Dlo("$A1xB")`,#16
+ vadd.u64 `&Dhi("$A1xB")`,`&Dhi("$A1xB")`,$temp
+ vshr.u64 $temp,`&Dhi("$A1xB")`,#16
+ vzip.16 `&Dlo("$A1xB")`,`&Dhi("$A1xB")`
+
+ vadd.u64 `&Dlo("$A2xB")`,`&Dlo("$A2xB")`,$temp
+ vst1.32 {`&Dlo("$A1xB")`[0]}, [$toutptr, :32]!
+ vshr.u64 $temp,`&Dlo("$A2xB")`,#16
+ vadd.u64 `&Dhi("$A2xB")`,`&Dhi("$A2xB")`,$temp
+ vshr.u64 $temp,`&Dhi("$A2xB")`,#16
+ vzip.16 `&Dlo("$A2xB")`,`&Dhi("$A2xB")`
+
+ vadd.u64 `&Dlo("$A3xB")`,`&Dlo("$A3xB")`,$temp
+ vst1.32 {`&Dlo("$A2xB")`[0]}, [$toutptr, :32]!
+ vshr.u64 $temp,`&Dlo("$A3xB")`,#16
+ vadd.u64 `&Dhi("$A3xB")`,`&Dhi("$A3xB")`,$temp
+ vshr.u64 $temp,`&Dhi("$A3xB")`,#16
+ vzip.16 `&Dlo("$A3xB")`,`&Dhi("$A3xB")`
+
+ vadd.u64 `&Dlo("$A4xB")`,`&Dlo("$A4xB")`,$temp
+ vst1.32 {`&Dlo("$A3xB")`[0]}, [$toutptr, :32]!
+ vshr.u64 $temp,`&Dlo("$A4xB")`,#16
+ vadd.u64 `&Dhi("$A4xB")`,`&Dhi("$A4xB")`,$temp
+ vshr.u64 $temp,`&Dhi("$A4xB")`,#16
+ vzip.16 `&Dlo("$A4xB")`,`&Dhi("$A4xB")`
+
+ vadd.u64 `&Dlo("$A5xB")`,`&Dlo("$A5xB")`,$temp
+ vst1.32 {`&Dlo("$A4xB")`[0]}, [$toutptr, :32]!
+ vshr.u64 $temp,`&Dlo("$A5xB")`,#16
+ vadd.u64 `&Dhi("$A5xB")`,`&Dhi("$A5xB")`,$temp
+ vshr.u64 $temp,`&Dhi("$A5xB")`,#16
+ vzip.16 `&Dlo("$A5xB")`,`&Dhi("$A5xB")`
+
+ vadd.u64 `&Dlo("$A6xB")`,`&Dlo("$A6xB")`,$temp
+ vst1.32 {`&Dlo("$A5xB")`[0]}, [$toutptr, :32]!
+ vshr.u64 $temp,`&Dlo("$A6xB")`,#16
+ vadd.u64 `&Dhi("$A6xB")`,`&Dhi("$A6xB")`,$temp
+ vld1.64 {$A0xB}, [$tinptr, :128]!
+ vshr.u64 $temp,`&Dhi("$A6xB")`,#16
+ vzip.16 `&Dlo("$A6xB")`,`&Dhi("$A6xB")`
+
+ vadd.u64 `&Dlo("$A7xB")`,`&Dlo("$A7xB")`,$temp
+ vst1.32 {`&Dlo("$A6xB")`[0]}, [$toutptr, :32]!
+ vshr.u64 $temp,`&Dlo("$A7xB")`,#16
+ vadd.u64 `&Dhi("$A7xB")`,`&Dhi("$A7xB")`,$temp
+ vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]!
+ vshr.u64 $temp,`&Dhi("$A7xB")`,#16
+ vzip.16 `&Dlo("$A7xB")`,`&Dhi("$A7xB")`
+ subs $inner,$inner,#8
+ vst1.32 {`&Dlo("$A7xB")`[0]}, [$toutptr, :32]!
+
+ bne .LNEON_tail
+
+ vst1.32 {${temp}[0]}, [$toutptr, :32] @ top-most bit
+ sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr
+ subs $aptr,sp,#0 @ clear carry flag
+ add $bptr,sp,$num,lsl#2
+
+.LNEON_sub:
+ ldmia $aptr!, {r4-r7}
+ ldmia $nptr!, {r8-r11}
+ sbcs r8, r4,r8
+ sbcs r9, r5,r9
+ sbcs r10,r6,r10
+ sbcs r11,r7,r11
+ teq $aptr,$bptr @ preserves carry
+ stmia $rptr!, {r8-r11}
+ bne .LNEON_sub
+
+ ldr r10, [$aptr] @ load top-most bit
+ veor q0,q0,q0
+ sub r11,$bptr,sp @ this is num*4
+ veor q1,q1,q1
+ mov $aptr,sp
+ sub $rptr,$rptr,r11 @ rewind $rptr
+ mov $nptr,$bptr @ second 3/4th of frame
+ sbcs r10,r10,#0 @ result is carry flag
+
+.LNEON_copy_n_zap:
+ ldmia $aptr!, {r4-r7}
+ ldmia $rptr, {r8-r11}
+ movcc r8, r4
+ vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
+ movcc r9, r5
+ movcc r10,r6
+ vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
+ movcc r11,r7
+ ldmia $aptr, {r4-r7}
+ stmia $rptr!, {r8-r11}
+ sub $aptr,$aptr,#16
+ ldmia $rptr, {r8-r11}
+ movcc r8, r4
+ vst1.64 {q0-q1}, [$aptr,:256]! @ wipe
+ movcc r9, r5
+ movcc r10,r6
+ vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
+ movcc r11,r7
+ teq $aptr,$bptr @ preserves carry
+ stmia $rptr!, {r8-r11}
+ bne .LNEON_copy_n_zap
+
+ sub sp,ip,#96
+ vldmia sp!,{d8-d15}
+ ldmia sp!,{r4-r11}
+ bx lr
+.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
+#endif
+___
+}
+$code.=<<___;
+.asciz "Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+#if __ARM_ARCH__>=7
+.comm OPENSSL_armcap_P,4,4
+#endif
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
+print $code;
+close STDOUT;
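
The integer path above is the classic word-serial Montgomery multiplication:
for each word of bp, accumulate ap*bp[i] into a temporary tp, add the multiple
of np that cancels tp's low word, and shift tp down one word. Below is a
plain-C sketch of that algorithm with 32-bit words (exposition only; it
mirrors what bn_mul_mont computes, not the library's actual fallback), ending
with the same masked tp-versus-rp selection the .Lsub/.Lcopy tail performs:

    #include <stdint.h>
    #include <stddef.h>

    /* rp = ap * bp * R^-1 mod np, where R = 2^(32*num) and
     * n0 = -np[0]^-1 mod 2^32.  tp is scratch space of num + 2 words. */
    static void mont_mul_sketch(uint32_t *rp, const uint32_t *ap,
                                const uint32_t *bp, const uint32_t *np,
                                uint32_t n0, size_t num, uint32_t *tp) {
      for (size_t i = 0; i < num + 2; i++) {
        tp[i] = 0;
      }
      for (size_t i = 0; i < num; i++) {
        uint64_t c = 0;
        for (size_t j = 0; j < num; j++) {  /* tp += ap * bp[i] */
          c += (uint64_t)ap[j] * bp[i] + tp[j];
          tp[j] = (uint32_t)c;
          c >>= 32;
        }
        c += tp[num];
        tp[num] = (uint32_t)c;
        tp[num + 1] = (uint32_t)(c >> 32);

        uint32_t m = tp[0] * n0;  /* makes tp + m*np divisible by 2^32 */
        c = ((uint64_t)m * np[0] + tp[0]) >> 32;  /* low word cancels */
        for (size_t j = 1; j < num; j++) {  /* add m*np, shifting a word */
          c += (uint64_t)m * np[j] + tp[j];
          tp[j - 1] = (uint32_t)c;
          c >>= 32;
        }
        c += tp[num];
        tp[num - 1] = (uint32_t)c;
        tp[num] = tp[num + 1] + (uint32_t)(c >> 32);
      }

      /* Conditional final subtraction, selected with a mask (like the
       * and/bic/orr in .Lcopy) rather than a value-dependent branch. */
      uint32_t borrow = 0;
      for (size_t j = 0; j < num; j++) {
        uint64_t d = (uint64_t)tp[j] - np[j] - borrow;
        rp[j] = (uint32_t)d;
        borrow = (uint32_t)((d >> 32) & 1);
      }
      uint32_t keep_tp = 0u - (tp[num] < borrow);  /* all-ones if tp < np */
      for (size_t j = 0; j < num; j++) {
        rp[j] = (tp[j] & keep_tp) | (rp[j] & ~keep_tp);
      }
    }
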
diff --git a/src/crypto/bn/asm/bn-586.pl b/src/crypto/bn/asm/bn-586.pl
new file mode 100644
index 0000000..26d9bcb
--- /dev/null
+++ b/src/crypto/bn/asm/bn-586.pl
@@ -0,0 +1,774 @@
+#!/usr/bin/env perl
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],$0);
+
+$sse2=0;
+for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
+
+&external_label("OPENSSL_ia32cap_P") if ($sse2);
+
+&bn_mul_add_words("bn_mul_add_words");
+&bn_mul_words("bn_mul_words");
+&bn_sqr_words("bn_sqr_words");
+&bn_div_words("bn_div_words");
+&bn_add_words("bn_add_words");
+&bn_sub_words("bn_sub_words");
+&bn_sub_part_words("bn_sub_part_words");
+
+&asm_finish();
+
+sub bn_mul_add_words
+ {
+ local($name)=@_;
+
+ &function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
+
+ $r="eax";
+ $a="edx";
+ $c="ecx";
+
+ if ($sse2) {
+ &picmeup("eax","OPENSSL_ia32cap_P");
+ &bt(&DWP(0,"eax"),26);
+ &jnc(&label("maw_non_sse2"));
+
+ &mov($r,&wparam(0));
+ &mov($a,&wparam(1));
+ &mov($c,&wparam(2));
+ &movd("mm0",&wparam(3)); # mm0 = w
+ &pxor("mm1","mm1"); # mm1 = carry_in
+ &jmp(&label("maw_sse2_entry"));
+
+ &set_label("maw_sse2_unrolled",16);
+ &movd("mm3",&DWP(0,$r,"",0)); # mm3 = r[0]
+ &paddq("mm1","mm3"); # mm1 = carry_in + r[0]
+ &movd("mm2",&DWP(0,$a,"",0)); # mm2 = a[0]
+ &pmuludq("mm2","mm0"); # mm2 = w*a[0]
+ &movd("mm4",&DWP(4,$a,"",0)); # mm4 = a[1]
+ &pmuludq("mm4","mm0"); # mm4 = w*a[1]
+ &movd("mm6",&DWP(8,$a,"",0)); # mm6 = a[2]
+ &pmuludq("mm6","mm0"); # mm6 = w*a[2]
+ &movd("mm7",&DWP(12,$a,"",0)); # mm7 = a[3]
+ &pmuludq("mm7","mm0"); # mm7 = w*a[3]
+ &paddq("mm1","mm2"); # mm1 = carry_in + r[0] + w*a[0]
+ &movd("mm3",&DWP(4,$r,"",0)); # mm3 = r[1]
+ &paddq("mm3","mm4"); # mm3 = r[1] + w*a[1]
+ &movd("mm5",&DWP(8,$r,"",0)); # mm5 = r[2]
+ &paddq("mm5","mm6"); # mm5 = r[2] + w*a[2]
+ &movd("mm4",&DWP(12,$r,"",0)); # mm4 = r[3]
+ &paddq("mm7","mm4"); # mm7 = r[3] + w*a[3]
+ &movd(&DWP(0,$r,"",0),"mm1");
+ &movd("mm2",&DWP(16,$a,"",0)); # mm2 = a[4]
+ &pmuludq("mm2","mm0"); # mm2 = w*a[4]
+ &psrlq("mm1",32); # mm1 = carry0
+ &movd("mm4",&DWP(20,$a,"",0)); # mm4 = a[5]
+ &pmuludq("mm4","mm0"); # mm4 = w*a[5]
+ &paddq("mm1","mm3"); # mm1 = carry0 + r[1] + w*a[1]
+ &movd("mm6",&DWP(24,$a,"",0)); # mm6 = a[6]
+ &pmuludq("mm6","mm0"); # mm6 = w*a[6]
+ &movd(&DWP(4,$r,"",0),"mm1");
+ &psrlq("mm1",32); # mm1 = carry1
+ &movd("mm3",&DWP(28,$a,"",0)); # mm3 = a[7]
+ &add($a,32);
+ &pmuludq("mm3","mm0"); # mm3 = w*a[7]
+ &paddq("mm1","mm5"); # mm1 = carry1 + r[2] + w*a[2]
+ &movd("mm5",&DWP(16,$r,"",0)); # mm5 = r[4]
+ &paddq("mm2","mm5"); # mm2 = r[4] + w*a[4]
+ &movd(&DWP(8,$r,"",0),"mm1");
+ &psrlq("mm1",32); # mm1 = carry2
+ &paddq("mm1","mm7"); # mm1 = carry2 + r[3] + w*a[3]
+ &movd("mm5",&DWP(20,$r,"",0)); # mm5 = r[5]
+ &paddq("mm4","mm5"); # mm4 = r[5] + w*a[5]
+ &movd(&DWP(12,$r,"",0),"mm1");
+ &psrlq("mm1",32); # mm1 = carry3
+ &paddq("mm1","mm2"); # mm1 = carry3 + r[4] + w*a[4]
+ &movd("mm5",&DWP(24,$r,"",0)); # mm5 = r[6]
+ &paddq("mm6","mm5"); # mm6 = r[6] + w*a[6]
+ &movd(&DWP(16,$r,"",0),"mm1");
+ &psrlq("mm1",32); # mm1 = carry4
+ &paddq("mm1","mm4"); # mm1 = carry4 + r[5] + w*a[5]
+ &movd("mm5",&DWP(28,$r,"",0)); # mm5 = r[7]
+ &paddq("mm3","mm5"); # mm3 = r[7] + w*a[7]
+ &movd(&DWP(20,$r,"",0),"mm1");
+ &psrlq("mm1",32); # mm1 = carry5
+ &paddq("mm1","mm6"); # mm1 = carry5 + r[6] + w*a[6]
+ &movd(&DWP(24,$r,"",0),"mm1");
+ &psrlq("mm1",32); # mm1 = carry6
+ &paddq("mm1","mm3"); # mm1 = carry6 + r[7] + w*a[7]
+ &movd(&DWP(28,$r,"",0),"mm1");
+ &lea($r,&DWP(32,$r));
+ &psrlq("mm1",32); # mm1 = carry_out
+
+ &sub($c,8);
+ &jz(&label("maw_sse2_exit"));
+ &set_label("maw_sse2_entry");
+ &test($c,0xfffffff8);
+ &jnz(&label("maw_sse2_unrolled"));
+
+ &set_label("maw_sse2_loop",4);
+ &movd("mm2",&DWP(0,$a)); # mm2 = a[i]
+ &movd("mm3",&DWP(0,$r)); # mm3 = r[i]
+ &pmuludq("mm2","mm0"); # a[i] *= w
+ &lea($a,&DWP(4,$a));
+ &paddq("mm1","mm3"); # carry += r[i]
+ &paddq("mm1","mm2"); # carry += a[i]*w
+ &movd(&DWP(0,$r),"mm1"); # r[i] = carry_low
+ &sub($c,1);
+ &psrlq("mm1",32); # carry = carry_high
+ &lea($r,&DWP(4,$r));
+ &jnz(&label("maw_sse2_loop"));
+ &set_label("maw_sse2_exit");
+ &movd("eax","mm1"); # c = carry_out
+ &emms();
+ &ret();
+
+ &set_label("maw_non_sse2",16);
+ }
+
+ # function_begin prologue
+ &push("ebp");
+ &push("ebx");
+ &push("esi");
+ &push("edi");
+
+ &comment("");
+ $Low="eax";
+ $High="edx";
+ $a="ebx";
+ $w="ebp";
+ $r="edi";
+ $c="esi";
+
+ &xor($c,$c); # clear carry
+ &mov($r,&wparam(0)); #
+
+ &mov("ecx",&wparam(2)); #
+ &mov($a,&wparam(1)); #
+
+ &and("ecx",0xfffffff8); # num / 8
+ &mov($w,&wparam(3)); #
+
+ &push("ecx"); # Up the stack for a tmp variable
+
+ &jz(&label("maw_finish"));
+
+ &set_label("maw_loop",16);
+
+ for ($i=0; $i<32; $i+=4)
+ {
+ &comment("Round $i");
+
+ &mov("eax",&DWP($i,$a)); # *a
+ &mul($w); # *a * w
+ &add("eax",$c); # L(t)+= c
+ &adc("edx",0); # H(t)+=carry
+ &add("eax",&DWP($i,$r)); # L(t)+= *r
+ &adc("edx",0); # H(t)+=carry
+ &mov(&DWP($i,$r),"eax"); # *r= L(t);
+ &mov($c,"edx"); # c= H(t);
+ }
+
+ &comment("");
+ &sub("ecx",8);
+ &lea($a,&DWP(32,$a));
+ &lea($r,&DWP(32,$r));
+ &jnz(&label("maw_loop"));
+
+ &set_label("maw_finish",0);
+ &mov("ecx",&wparam(2)); # get num
+ &and("ecx",7);
+ &jnz(&label("maw_finish2")); # helps branch prediction
+ &jmp(&label("maw_end"));
+
+ &set_label("maw_finish2",1);
+ for ($i=0; $i<7; $i++)
+ {
+ &comment("Tail Round $i");
+ &mov("eax",&DWP($i*4,$a)); # *a
+ &mul($w); # *a * w
+ &add("eax",$c); # L(t)+=c
+ &adc("edx",0); # H(t)+=carry
+ &add("eax",&DWP($i*4,$r)); # L(t)+= *r
+ &adc("edx",0); # H(t)+=carry
+ &dec("ecx") if ($i != 7-1);
+ &mov(&DWP($i*4,$r),"eax"); # *r= L(t);
+ &mov($c,"edx"); # c= H(t);
+ &jz(&label("maw_end")) if ($i != 7-1);
+ }
+ &set_label("maw_end",0);
+ &mov("eax",$c);
+
+	&pop("ecx");			# remove the tmp variable from the stack
+
+ &function_end($name);
+ }
+
+sub bn_mul_words
+ {
+ local($name)=@_;
+
+ &function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
+
+ $r="eax";
+ $a="edx";
+ $c="ecx";
+
+ if ($sse2) {
+ &picmeup("eax","OPENSSL_ia32cap_P");
+ &bt(&DWP(0,"eax"),26);
+ &jnc(&label("mw_non_sse2"));
+
+ &mov($r,&wparam(0));
+ &mov($a,&wparam(1));
+ &mov($c,&wparam(2));
+ &movd("mm0",&wparam(3)); # mm0 = w
+ &pxor("mm1","mm1"); # mm1 = carry = 0
+
+ &set_label("mw_sse2_loop",16);
+ &movd("mm2",&DWP(0,$a)); # mm2 = a[i]
+ &pmuludq("mm2","mm0"); # a[i] *= w
+ &lea($a,&DWP(4,$a));
+ &paddq("mm1","mm2"); # carry += a[i]*w
+ &movd(&DWP(0,$r),"mm1"); # r[i] = carry_low
+ &sub($c,1);
+ &psrlq("mm1",32); # carry = carry_high
+ &lea($r,&DWP(4,$r));
+ &jnz(&label("mw_sse2_loop"));
+
+ &movd("eax","mm1"); # return carry
+ &emms();
+ &ret();
+ &set_label("mw_non_sse2",16);
+ }
+
+ # function_begin prologue
+ &push("ebp");
+ &push("ebx");
+ &push("esi");
+ &push("edi");
+
+ &comment("");
+ $Low="eax";
+ $High="edx";
+ $a="ebx";
+ $w="ecx";
+ $r="edi";
+ $c="esi";
+ $num="ebp";
+
+ &xor($c,$c); # clear carry
+ &mov($r,&wparam(0)); #
+ &mov($a,&wparam(1)); #
+ &mov($num,&wparam(2)); #
+ &mov($w,&wparam(3)); #
+
+ &and($num,0xfffffff8); # num / 8
+ &jz(&label("mw_finish"));
+
+ &set_label("mw_loop",0);
+ for ($i=0; $i<32; $i+=4)
+ {
+ &comment("Round $i");
+
+ &mov("eax",&DWP($i,$a,"",0)); # *a
+ &mul($w); # *a * w
+ &add("eax",$c); # L(t)+=c
+ # XXX
+
+ &adc("edx",0); # H(t)+=carry
+ &mov(&DWP($i,$r,"",0),"eax"); # *r= L(t);
+
+ &mov($c,"edx"); # c= H(t);
+ }
+
+ &comment("");
+ &add($a,32);
+ &add($r,32);
+ &sub($num,8);
+ &jz(&label("mw_finish"));
+ &jmp(&label("mw_loop"));
+
+ &set_label("mw_finish",0);
+ &mov($num,&wparam(2)); # get num
+ &and($num,7);
+ &jnz(&label("mw_finish2"));
+ &jmp(&label("mw_end"));
+
+ &set_label("mw_finish2",1);
+ for ($i=0; $i<7; $i++)
+ {
+ &comment("Tail Round $i");
+ &mov("eax",&DWP($i*4,$a,"",0));# *a
+ &mul($w); # *a * w
+ &add("eax",$c); # L(t)+=c
+ # XXX
+ &adc("edx",0); # H(t)+=carry
+ &mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t);
+ &mov($c,"edx"); # c= H(t);
+ &dec($num) if ($i != 7-1);
+ &jz(&label("mw_end")) if ($i != 7-1);
+ }
+ &set_label("mw_end",0);
+ &mov("eax",$c);
+
+ &function_end($name);
+ }
+
+sub bn_sqr_words
+ {
+ local($name)=@_;
+
+ &function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
+
+ $r="eax";
+ $a="edx";
+ $c="ecx";
+
+ if ($sse2) {
+ &picmeup("eax","OPENSSL_ia32cap_P");
+ &bt(&DWP(0,"eax"),26);
+ &jnc(&label("sqr_non_sse2"));
+
+ &mov($r,&wparam(0));
+ &mov($a,&wparam(1));
+ &mov($c,&wparam(2));
+
+ &set_label("sqr_sse2_loop",16);
+ &movd("mm0",&DWP(0,$a)); # mm0 = a[i]
+ &pmuludq("mm0","mm0"); # a[i] *= a[i]
+ &lea($a,&DWP(4,$a)); # a++
+ &movq(&QWP(0,$r),"mm0"); # r[i] = a[i]*a[i]
+ &sub($c,1);
+ &lea($r,&DWP(8,$r)); # r += 2
+ &jnz(&label("sqr_sse2_loop"));
+
+ &emms();
+ &ret();
+ &set_label("sqr_non_sse2",16);
+ }
+
+ # function_begin prologue
+ &push("ebp");
+ &push("ebx");
+ &push("esi");
+ &push("edi");
+
+ &comment("");
+ $r="esi";
+ $a="edi";
+ $num="ebx";
+
+ &mov($r,&wparam(0)); #
+ &mov($a,&wparam(1)); #
+ &mov($num,&wparam(2)); #
+
+ &and($num,0xfffffff8); # num / 8
+ &jz(&label("sw_finish"));
+
+ &set_label("sw_loop",0);
+ for ($i=0; $i<32; $i+=4)
+ {
+ &comment("Round $i");
+ &mov("eax",&DWP($i,$a,"",0)); # *a
+ # XXX
+ &mul("eax"); # *a * *a
+ &mov(&DWP($i*2,$r,"",0),"eax"); #
+ &mov(&DWP($i*2+4,$r,"",0),"edx");#
+ }
+
+ &comment("");
+ &add($a,32);
+ &add($r,64);
+ &sub($num,8);
+ &jnz(&label("sw_loop"));
+
+ &set_label("sw_finish",0);
+ &mov($num,&wparam(2)); # get num
+ &and($num,7);
+ &jz(&label("sw_end"));
+
+ for ($i=0; $i<7; $i++)
+ {
+ &comment("Tail Round $i");
+ &mov("eax",&DWP($i*4,$a,"",0)); # *a
+ # XXX
+ &mul("eax"); # *a * *a
+ &mov(&DWP($i*8,$r,"",0),"eax"); #
+ &dec($num) if ($i != 7-1);
+ &mov(&DWP($i*8+4,$r,"",0),"edx");
+ &jz(&label("sw_end")) if ($i != 7-1);
+ }
+ &set_label("sw_end",0);
+
+ &function_end($name);
+ }
+
+sub bn_div_words
+ {
+ local($name)=@_;
+
+ &function_begin_B($name,"");
+ &mov("edx",&wparam(0)); #
+ &mov("eax",&wparam(1)); #
+ &mov("ecx",&wparam(2)); #
+ &div("ecx");
+ &ret();
+ &function_end_B($name);
+ }
+
+sub bn_add_words
+ {
+ local($name)=@_;
+
+ &function_begin($name,"");
+
+ &comment("");
+ $a="esi";
+ $b="edi";
+ $c="eax";
+ $r="ebx";
+ $tmp1="ecx";
+ $tmp2="edx";
+ $num="ebp";
+
+ &mov($r,&wparam(0)); # get r
+ &mov($a,&wparam(1)); # get a
+ &mov($b,&wparam(2)); # get b
+ &mov($num,&wparam(3)); # get num
+ &xor($c,$c); # clear carry
+ &and($num,0xfffffff8); # num / 8
+
+ &jz(&label("aw_finish"));
+
+ &set_label("aw_loop",0);
+ for ($i=0; $i<8; $i++)
+ {
+ &comment("Round $i");
+
+ &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
+ &mov($tmp2,&DWP($i*4,$b,"",0)); # *b
+ &add($tmp1,$c);
+ &mov($c,0);
+ &adc($c,$c);
+ &add($tmp1,$tmp2);
+ &adc($c,0);
+ &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
+ }
+
+ &comment("");
+ &add($a,32);
+ &add($b,32);
+ &add($r,32);
+ &sub($num,8);
+ &jnz(&label("aw_loop"));
+
+ &set_label("aw_finish",0);
+ &mov($num,&wparam(3)); # get num
+ &and($num,7);
+ &jz(&label("aw_end"));
+
+ for ($i=0; $i<7; $i++)
+ {
+ &comment("Tail Round $i");
+ &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
+ &mov($tmp2,&DWP($i*4,$b,"",0));# *b
+ &add($tmp1,$c);
+ &mov($c,0);
+ &adc($c,$c);
+ &add($tmp1,$tmp2);
+ &adc($c,0);
+ &dec($num) if ($i != 6);
+ &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
+ &jz(&label("aw_end")) if ($i != 6);
+ }
+ &set_label("aw_end",0);
+
+# &mov("eax",$c); # $c is "eax"
+
+ &function_end($name);
+ }
+
+sub bn_sub_words
+ {
+ local($name)=@_;
+
+ &function_begin($name,"");
+
+ &comment("");
+ $a="esi";
+ $b="edi";
+ $c="eax";
+ $r="ebx";
+ $tmp1="ecx";
+ $tmp2="edx";
+ $num="ebp";
+
+ &mov($r,&wparam(0)); # get r
+ &mov($a,&wparam(1)); # get a
+ &mov($b,&wparam(2)); # get b
+ &mov($num,&wparam(3)); # get num
+ &xor($c,$c); # clear carry
+ &and($num,0xfffffff8); # num / 8
+
+ &jz(&label("aw_finish"));
+
+ &set_label("aw_loop",0);
+ for ($i=0; $i<8; $i++)
+ {
+ &comment("Round $i");
+
+ &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
+ &mov($tmp2,&DWP($i*4,$b,"",0)); # *b
+ &sub($tmp1,$c);
+ &mov($c,0);
+ &adc($c,$c);
+ &sub($tmp1,$tmp2);
+ &adc($c,0);
+ &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
+ }
+
+ &comment("");
+ &add($a,32);
+ &add($b,32);
+ &add($r,32);
+ &sub($num,8);
+ &jnz(&label("aw_loop"));
+
+ &set_label("aw_finish",0);
+ &mov($num,&wparam(3)); # get num
+ &and($num,7);
+ &jz(&label("aw_end"));
+
+ for ($i=0; $i<7; $i++)
+ {
+ &comment("Tail Round $i");
+ &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
+ &mov($tmp2,&DWP($i*4,$b,"",0));# *b
+ &sub($tmp1,$c);
+ &mov($c,0);
+ &adc($c,$c);
+ &sub($tmp1,$tmp2);
+ &adc($c,0);
+ &dec($num) if ($i != 6);
+ &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
+ &jz(&label("aw_end")) if ($i != 6);
+ }
+ &set_label("aw_end",0);
+
+# &mov("eax",$c); # $c is "eax"
+
+ &function_end($name);
+ }
+
+sub bn_sub_part_words
+ {
+ local($name)=@_;
+
+ &function_begin($name,"");
+
+ &comment("");
+ $a="esi";
+ $b="edi";
+ $c="eax";
+ $r="ebx";
+ $tmp1="ecx";
+ $tmp2="edx";
+ $num="ebp";
+
+ &mov($r,&wparam(0)); # get r
+ &mov($a,&wparam(1)); # get a
+ &mov($b,&wparam(2)); # get b
+ &mov($num,&wparam(3)); # get num
+ &xor($c,$c); # clear carry
+ &and($num,0xfffffff8); # num / 8
+
+ &jz(&label("aw_finish"));
+
+ &set_label("aw_loop",0);
+ for ($i=0; $i<8; $i++)
+ {
+ &comment("Round $i");
+
+ &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
+ &mov($tmp2,&DWP($i*4,$b,"",0)); # *b
+ &sub($tmp1,$c);
+ &mov($c,0);
+ &adc($c,$c);
+ &sub($tmp1,$tmp2);
+ &adc($c,0);
+ &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
+ }
+
+ &comment("");
+ &add($a,32);
+ &add($b,32);
+ &add($r,32);
+ &sub($num,8);
+ &jnz(&label("aw_loop"));
+
+ &set_label("aw_finish",0);
+ &mov($num,&wparam(3)); # get num
+ &and($num,7);
+ &jz(&label("aw_end"));
+
+ for ($i=0; $i<7; $i++)
+ {
+ &comment("Tail Round $i");
+ &mov($tmp1,&DWP(0,$a,"",0)); # *a
+ &mov($tmp2,&DWP(0,$b,"",0));# *b
+ &sub($tmp1,$c);
+ &mov($c,0);
+ &adc($c,$c);
+ &sub($tmp1,$tmp2);
+ &adc($c,0);
+ &mov(&DWP(0,$r,"",0),$tmp1); # *r
+ &add($a, 4);
+ &add($b, 4);
+ &add($r, 4);
+ &dec($num) if ($i != 6);
+ &jz(&label("aw_end")) if ($i != 6);
+ }
+ &set_label("aw_end",0);
+
+ &cmp(&wparam(4),0);
+ &je(&label("pw_end"));
+
+ &mov($num,&wparam(4)); # get dl
+ &cmp($num,0);
+ &je(&label("pw_end"));
+ &jge(&label("pw_pos"));
+
+ &comment("pw_neg");
+ &mov($tmp2,0);
+ &sub($tmp2,$num);
+ &mov($num,$tmp2);
+ &and($num,0xfffffff8); # num / 8
+ &jz(&label("pw_neg_finish"));
+
+ &set_label("pw_neg_loop",0);
+ for ($i=0; $i<8; $i++)
+ {
+ &comment("dl<0 Round $i");
+
+ &mov($tmp1,0);
+ &mov($tmp2,&DWP($i*4,$b,"",0)); # *b
+ &sub($tmp1,$c);
+ &mov($c,0);
+ &adc($c,$c);
+ &sub($tmp1,$tmp2);
+ &adc($c,0);
+ &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
+ }
+
+ &comment("");
+ &add($b,32);
+ &add($r,32);
+ &sub($num,8);
+ &jnz(&label("pw_neg_loop"));
+
+ &set_label("pw_neg_finish",0);
+ &mov($tmp2,&wparam(4)); # get dl
+ &mov($num,0);
+ &sub($num,$tmp2);
+ &and($num,7);
+ &jz(&label("pw_end"));
+
+ for ($i=0; $i<7; $i++)
+ {
+ &comment("dl<0 Tail Round $i");
+ &mov($tmp1,0);
+ &mov($tmp2,&DWP($i*4,$b,"",0));# *b
+ &sub($tmp1,$c);
+ &mov($c,0);
+ &adc($c,$c);
+ &sub($tmp1,$tmp2);
+ &adc($c,0);
+ &dec($num) if ($i != 6);
+ &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
+ &jz(&label("pw_end")) if ($i != 6);
+ }
+
+ &jmp(&label("pw_end"));
+
+ &set_label("pw_pos",0);
+
+ &and($num,0xfffffff8); # num / 8
+ &jz(&label("pw_pos_finish"));
+
+ &set_label("pw_pos_loop",0);
+
+ for ($i=0; $i<8; $i++)
+ {
+ &comment("dl>0 Round $i");
+
+ &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
+ &sub($tmp1,$c);
+ &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
+ &jnc(&label("pw_nc".$i));
+ }
+
+ &comment("");
+ &add($a,32);
+ &add($r,32);
+ &sub($num,8);
+ &jnz(&label("pw_pos_loop"));
+
+ &set_label("pw_pos_finish",0);
+ &mov($num,&wparam(4)); # get dl
+ &and($num,7);
+ &jz(&label("pw_end"));
+
+ for ($i=0; $i<7; $i++)
+ {
+ &comment("dl>0 Tail Round $i");
+ &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
+ &sub($tmp1,$c);
+ &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
+ &jnc(&label("pw_tail_nc".$i));
+ &dec($num) if ($i != 6);
+ &jz(&label("pw_end")) if ($i != 6);
+ }
+ &mov($c,1);
+ &jmp(&label("pw_end"));
+
+ &set_label("pw_nc_loop",0);
+ for ($i=0; $i<8; $i++)
+ {
+ &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
+ &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
+ &set_label("pw_nc".$i,0);
+ }
+
+ &comment("");
+ &add($a,32);
+ &add($r,32);
+ &sub($num,8);
+ &jnz(&label("pw_nc_loop"));
+
+ &mov($num,&wparam(4)); # get dl
+ &and($num,7);
+ &jz(&label("pw_nc_end"));
+
+ for ($i=0; $i<7; $i++)
+ {
+ &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
+ &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
+ &set_label("pw_tail_nc".$i,0);
+ &dec($num) if ($i != 6);
+ &jz(&label("pw_nc_end")) if ($i != 6);
+ }
+
+ &set_label("pw_nc_end",0);
+ &mov($c,0);
+
+ &set_label("pw_end",0);
+
+# &mov("eax",$c); # $c is "eax"
+
+ &function_end($name);
+ }
+
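Each routine generated above has plain word-array semantics; bn_mul_add_words,
for example, computes rp[] += ap[] * w and returns the final carry, and both
the pmuludq path and the scalar mul/adc path unroll exactly that loop eight
words at a time. A reference statement of the contract in portable C (a
sketch; the tree's real C fallbacks live in generic.c, listed in the diffstat
above):

    #include <stdint.h>
    #include <stddef.h>

    /* rp[0..num-1] += ap[0..num-1] * w; returns the final carry word. */
    static uint32_t mul_add_words_ref(uint32_t *rp, const uint32_t *ap,
                                      size_t num, uint32_t w) {
      uint64_t carry = 0;
      for (size_t i = 0; i < num; i++) {
        /* (2^32-1)^2 + 2*(2^32-1) == 2^64 - 1, so this never overflows. */
        carry += (uint64_t)ap[i] * w + rp[i];
        rp[i] = (uint32_t)carry;
        carry >>= 32;
      }
      return (uint32_t)carry;
    }
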
diff --git a/src/crypto/bn/asm/co-586.pl b/src/crypto/bn/asm/co-586.pl
new file mode 100644
index 0000000..57101a6
--- /dev/null
+++ b/src/crypto/bn/asm/co-586.pl
@@ -0,0 +1,287 @@
+#!/usr/local/bin/perl
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],$0);
+
+&bn_mul_comba("bn_mul_comba8",8);
+&bn_mul_comba("bn_mul_comba4",4);
+&bn_sqr_comba("bn_sqr_comba8",8);
+&bn_sqr_comba("bn_sqr_comba4",4);
+
+&asm_finish();
+
+sub mul_add_c
+ {
+ local($a,$ai,$b,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
+
+	# pos == -1 if eax and edx are pre-loaded, 0 to load from the next
+	# words, and 1 to load the return value
+
+ &comment("mul a[$ai]*b[$bi]");
+
+ # "eax" and "edx" will always be pre-loaded.
+ # &mov("eax",&DWP($ai*4,$a,"",0)) ;
+ # &mov("edx",&DWP($bi*4,$b,"",0));
+
+ &mul("edx");
+ &add($c0,"eax");
+	&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0;	# load next a
+ &mov("eax",&wparam(0)) if $pos > 0; # load r[]
+ ###
+ &adc($c1,"edx");
+	&mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 0;	# load next b
+	&mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1;	# load next b
+ ###
+ &adc($c2,0);
+	# if pos > 1, it means it is the last loop
+ &mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0; # save r[];
+	&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1;	# load next a
+ }
+
+sub sqr_add_c
+ {
+ local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
+
+	# pos == -1 if eax and edx are pre-loaded, 0 to load from the next
+	# words, and 1 to load the return value
+
+ &comment("sqr a[$ai]*a[$bi]");
+
+ # "eax" and "edx" will always be pre-loaded.
+ # &mov("eax",&DWP($ai*4,$a,"",0)) ;
+ # &mov("edx",&DWP($bi*4,$b,"",0));
+
+ if ($ai == $bi)
+ { &mul("eax");}
+ else
+ { &mul("edx");}
+ &add($c0,"eax");
+ &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
+ ###
+ &adc($c1,"edx");
+ &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb);
+ ###
+ &adc($c2,0);
+	# if pos > 1, it means it is the last loop
+ &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
+ &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
+ }
+
+sub sqr_add_c2
+ {
+ local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
+
+	# pos == -1 if eax and edx are pre-loaded, 0 to load from the next
+	# words, and 1 to load the return value
+
+ &comment("sqr a[$ai]*a[$bi]");
+
+ # "eax" and "edx" will always be pre-loaded.
+ # &mov("eax",&DWP($ai*4,$a,"",0)) ;
+ # &mov("edx",&DWP($bi*4,$a,"",0));
+
+ if ($ai == $bi)
+ { &mul("eax");}
+ else
+ { &mul("edx");}
+ &add("eax","eax");
+ ###
+ &adc("edx","edx");
+ ###
+ &adc($c2,0);
+ &add($c0,"eax");
+ &adc($c1,"edx");
+ &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
+ &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
+ &adc($c2,0);
+ &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
+ &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos <= 1) && ($na != $nb);
+ ###
+ }
+
+sub bn_mul_comba
+ {
+ local($name,$num)=@_;
+ local($a,$b,$c0,$c1,$c2);
+ local($i,$as,$ae,$bs,$be,$ai,$bi);
+ local($tot,$end);
+
+ &function_begin_B($name,"");
+
+ $c0="ebx";
+ $c1="ecx";
+ $c2="ebp";
+ $a="esi";
+ $b="edi";
+
+ $as=0;
+ $ae=0;
+ $bs=0;
+ $be=0;
+ $tot=$num+$num-1;
+
+ &push("esi");
+ &mov($a,&wparam(1));
+ &push("edi");
+ &mov($b,&wparam(2));
+ &push("ebp");
+ &push("ebx");
+
+ &xor($c0,$c0);
+ &mov("eax",&DWP(0,$a,"",0)); # load the first word
+ &xor($c1,$c1);
+	&mov("edx",&DWP(0,$b,"",0));	# load the first word of b
+
+ for ($i=0; $i<$tot; $i++)
+ {
+ $ai=$as;
+ $bi=$bs;
+ $end=$be+1;
+
+ &comment("################## Calculate word $i");
+
+ for ($j=$bs; $j<$end; $j++)
+ {
+ &xor($c2,$c2) if ($j == $bs);
+ if (($j+1) == $end)
+ {
+ $v=1;
+ $v=2 if (($i+1) == $tot);
+ }
+ else
+ { $v=0; }
+ if (($j+1) != $end)
+ {
+ $na=($ai-1);
+ $nb=($bi+1);
+ }
+ else
+ {
+ $na=$as+($i < ($num-1));
+ $nb=$bs+($i >= ($num-1));
+ }
+#printf STDERR "[$ai,$bi] -> [$na,$nb]\n";
+ &mul_add_c($a,$ai,$b,$bi,$c0,$c1,$c2,$v,$i,$na,$nb);
+ if ($v)
+ {
+ &comment("saved r[$i]");
+ # &mov("eax",&wparam(0));
+ # &mov(&DWP($i*4,"eax","",0),$c0);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ }
+ $ai--;
+ $bi++;
+ }
+ $as++ if ($i < ($num-1));
+ $ae++ if ($i >= ($num-1));
+
+ $bs++ if ($i >= ($num-1));
+ $be++ if ($i < ($num-1));
+ }
+ &comment("save r[$i]");
+ # &mov("eax",&wparam(0));
+ &mov(&DWP($i*4,"eax","",0),$c0);
+
+ &pop("ebx");
+ &pop("ebp");
+ &pop("edi");
+ &pop("esi");
+ &ret();
+ &function_end_B($name);
+ }
+
+sub bn_sqr_comba
+ {
+ local($name,$num)=@_;
+ local($r,$a,$c0,$c1,$c2)=@_;
+ local($i,$as,$ae,$bs,$be,$ai,$bi);
+ local($b,$tot,$end,$half);
+
+ &function_begin_B($name,"");
+
+ $c0="ebx";
+ $c1="ecx";
+ $c2="ebp";
+ $a="esi";
+ $r="edi";
+
+ &push("esi");
+ &push("edi");
+ &push("ebp");
+ &push("ebx");
+ &mov($r,&wparam(0));
+ &mov($a,&wparam(1));
+ &xor($c0,$c0);
+ &xor($c1,$c1);
+ &mov("eax",&DWP(0,$a,"",0)); # load the first word
+
+ $as=0;
+ $ae=0;
+ $bs=0;
+ $be=0;
+ $tot=$num+$num-1;
+
+ for ($i=0; $i<$tot; $i++)
+ {
+ $ai=$as;
+ $bi=$bs;
+ $end=$be+1;
+
+ &comment("############### Calculate word $i");
+ for ($j=$bs; $j<$end; $j++)
+ {
+ &xor($c2,$c2) if ($j == $bs);
+ if (($ai-1) < ($bi+1))
+ {
+ $v=1;
+ $v=2 if ($i+1) == $tot;
+ }
+ else
+ { $v=0; }
+ if (!$v)
+ {
+ $na=$ai-1;
+ $nb=$bi+1;
+ }
+ else
+ {
+ $na=$as+($i < ($num-1));
+ $nb=$bs+($i >= ($num-1));
+ }
+ if ($ai == $bi)
+ {
+ &sqr_add_c($r,$a,$ai,$bi,
+ $c0,$c1,$c2,$v,$i,$na,$nb);
+ }
+ else
+ {
+ &sqr_add_c2($r,$a,$ai,$bi,
+ $c0,$c1,$c2,$v,$i,$na,$nb);
+ }
+ if ($v)
+ {
+ &comment("saved r[$i]");
+ #&mov(&DWP($i*4,$r,"",0),$c0);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ last;
+ }
+ $ai--;
+ $bi++;
+ }
+ $as++ if ($i < ($num-1));
+ $ae++ if ($i >= ($num-1));
+
+ $bs++ if ($i >= ($num-1));
+ $be++ if ($i < ($num-1));
+ }
+ &mov(&DWP($i*4,$r,"",0),$c0);
+ &pop("ebx");
+ &pop("ebp");
+ &pop("edi");
+ &pop("esi");
+ &ret();
+ &function_end_B($name);
+ }
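
The comba generators above compute the product column by column: every partial
product a[i]*b[j] with i + j == k is accumulated into a three-word accumulator
before r[k] is stored, and the ($c0,$c1,$c2) rotation in the Perl is that
accumulator shifting down one word per column. An illustrative plain-C version
of the 4x4 case (a sketch of the technique, not the generated code):

    #include <stdint.h>

    /* Column-wise (comba) multiplication: r[8] = a[4] * b[4]. */
    static void mul_comba4_ref(uint32_t r[8], const uint32_t a[4],
                               const uint32_t b[4]) {
      uint32_t c0 = 0, c1 = 0, c2 = 0;  /* 96-bit column accumulator */
      for (int k = 0; k < 7; k++) {
        int lo = k > 3 ? k - 3 : 0;
        int hi = k < 3 ? k : 3;
        for (int i = lo; i <= hi; i++) {  /* all a[i]*b[j] with i+j == k */
          uint64_t t = (uint64_t)a[i] * b[k - i] + c0;
          c0 = (uint32_t)t;
          uint64_t u = (uint64_t)c1 + (uint32_t)(t >> 32);
          c1 = (uint32_t)u;
          c2 += (uint32_t)(u >> 32);
        }
        r[k] = c0;
        c0 = c1;  /* rotate the accumulator down one word */
        c1 = c2;
        c2 = 0;
      }
      r[7] = c0;  /* whatever carried past the last column */
    }
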
diff --git a/src/crypto/bn/asm/rsaz-avx2.pl b/src/crypto/bn/asm/rsaz-avx2.pl
new file mode 100644
index 0000000..3b6ccf8
--- /dev/null
+++ b/src/crypto/bn/asm/rsaz-avx2.pl
@@ -0,0 +1,1898 @@
+#!/usr/bin/env perl
+
+##############################################################################
+# #
+# Copyright (c) 2012, Intel Corporation #
+# #
+# All rights reserved. #
+# #
+# Redistribution and use in source and binary forms, with or without #
+# modification, are permitted provided that the following conditions are #
+# met: #
+# #
+# * Redistributions of source code must retain the above copyright #
+# notice, this list of conditions and the following disclaimer. #
+# #
+# * Redistributions in binary form must reproduce the above copyright #
+# notice, this list of conditions and the following disclaimer in the #
+# documentation and/or other materials provided with the #
+# distribution. #
+# #
+# * Neither the name of the Intel Corporation nor the names of its #
+# contributors may be used to endorse or promote products derived from #
+# this software without specific prior written permission. #
+# #
+# #
+# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY #
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE #
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR #
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR #
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, #
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, #
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR #
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF #
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING #
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS #
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #
+# #
+##############################################################################
+# Developers and authors: #
+# Shay Gueron (1, 2), and Vlad Krasnov (1) #
+# (1) Intel Corporation, Israel Development Center, Haifa, Israel #
+# (2) University of Haifa, Israel #
+##############################################################################
+# Reference: #
+# [1] S. Gueron, V. Krasnov: "Software Implementation of Modular #
+# Exponentiation, Using Advanced Vector Instructions Architectures", #
+# F. Ozbudak and F. Rodriguez-Henriquez (Eds.): WAIFI 2012, LNCS 7369, #
+# pp. 119-135, 2012. Springer-Verlag Berlin Heidelberg 2012             #
+# [2] S. Gueron: "Efficient Software Implementations of Modular #
+# Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012). #
+# [3] S. Gueron, V. Krasnov: "Speeding up Big-numbers Squaring",IEEE #
+# Proceedings of 9th International Conference on Information Technology: #
+# New Generations (ITNG 2012), pp.821-823 (2012) #
+# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis #
+# resistant 1024-bit modular exponentiation, for optimizing RSA2048 #
+# on AVX2 capable x86_64 platforms", #
+# http://rt.openssl.org/Ticket/Display.html?id=2850&user=guest&pass=guest#
+##############################################################################
+#
+# +13% improvement over original submission by <appro@openssl.org>
+#
+# rsa2048 sign/sec OpenSSL 1.0.1 scalar(*) this
+# 2.3GHz Haswell 621 765/+23% 1113/+79%
+# 2.3GHz Broadwell(**) 688 1200(***)/+74% 1120/+63%
+#
+# (*) if system doesn't support AVX2, for reference purposes;
+# (**) scaled to 2.3GHz to simplify comparison;
+# (***) scalar AD*X code is faster than AVX2 and is preferred code
+# path for Broadwell;
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.19) + ($1>=2.22);
+ $addx = ($1>=2.23);
+}
+
+if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.09) + ($1>=2.10);
+ $addx = ($1>=2.10);
+}
+
+if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+ $avx = ($1>=10) + ($1>=11);
+ $addx = ($1>=11);
+}
+
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) {
+ my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
+ $avx = ($ver>=3.0) + ($ver>=3.01);
+ $addx = ($ver>=3.03);
+}
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT = *OUT;
+
+if ($avx>1) {{{
+{ # void AMS_WW(
+my $rp="%rdi"; # BN_ULONG *rp,
+my $ap="%rsi"; # const BN_ULONG *ap,
+my $np="%rdx"; # const BN_ULONG *np,
+my $n0="%ecx"; # const BN_ULONG n0,
+my $rep="%r8d"; # int repeat);
+
+# The registers that hold the accumulated redundant result.
+# The AMM works on 1024-bit operands with a redundant word size of 29
+# bits, i.e. ceil(1024/29) = 36 digits, held four per register: 36/4 = 9
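+#
+# An illustrative Perl sketch of the representation (a minimal sketch,
+# assuming Math::BigInt semantics for $A; it is not part of the
+# generated code):
+#
+#	my @a  = map { ($A >> (29*$_)) & 0x1fffffff } (0..35);	# split
+#	my $A2 = 0; $A2 = ($A2 << 29) + $a[35-$_] for (0..35);	# rebuild
+#
+# Keeping 29-bit digits in 64-bit lanes leaves headroom: a digit product
+# is at most 58 bits wide, so many partial products can be accumulated
+# before carries must be propagated.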
+my $ACC0="%ymm0";
+my $ACC1="%ymm1";
+my $ACC2="%ymm2";
+my $ACC3="%ymm3";
+my $ACC4="%ymm4";
+my $ACC5="%ymm5";
+my $ACC6="%ymm6";
+my $ACC7="%ymm7";
+my $ACC8="%ymm8";
+my $ACC9="%ymm9";
+# Registers that hold the broadcasted words of bp, currently used
+my $B1="%ymm10";
+my $B2="%ymm11";
+# Registers that hold the broadcasted words of Y, currently used
+my $Y1="%ymm12";
+my $Y2="%ymm13";
+# Helper registers
+my $TEMP1="%ymm14";
+my $AND_MASK="%ymm15";
+# ALU registers that hold the first words of the ACC
+my $r0="%r9";
+my $r1="%r10";
+my $r2="%r11";
+my $r3="%r12";
+
+my $i="%r14d"; # loop counter
+my $tmp = "%r15";
+
+my $FrameSize=32*18+32*8; # place for A^2 and 2*A
+
+my $aap=$r0;
+my $tp0="%rbx";
+my $tp1=$r3;
+my $tpa=$tmp;
+
+$np="%r13"; # reassigned argument
+
+$code.=<<___;
+.text
+
+.globl rsaz_1024_sqr_avx2
+.type rsaz_1024_sqr_avx2,\@function,5
+.align 64
+rsaz_1024_sqr_avx2: # 702 cycles, 14% faster than rsaz_1024_mul_avx2
+ lea (%rsp), %rax
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ lea -0xa8(%rsp),%rsp
+ vmovaps %xmm6,-0xd8(%rax)
+ vmovaps %xmm7,-0xc8(%rax)
+ vmovaps %xmm8,-0xb8(%rax)
+ vmovaps %xmm9,-0xa8(%rax)
+ vmovaps %xmm10,-0x98(%rax)
+ vmovaps %xmm11,-0x88(%rax)
+ vmovaps %xmm12,-0x78(%rax)
+ vmovaps %xmm13,-0x68(%rax)
+ vmovaps %xmm14,-0x58(%rax)
+ vmovaps %xmm15,-0x48(%rax)
+.Lsqr_1024_body:
+___
+$code.=<<___;
+ mov %rax,%rbp
+ mov %rdx, $np # reassigned argument
+ sub \$$FrameSize, %rsp
+ mov $np, $tmp
+ sub \$-128, $rp # size optimization
+ sub \$-128, $ap
+ sub \$-128, $np
+
+ and \$4095, $tmp # see if $np crosses page
+ add \$32*10, $tmp
+ shr \$12, $tmp
+ vpxor $ACC9,$ACC9,$ACC9
+ jz .Lsqr_1024_no_n_copy
+
+	# an unaligned 256-bit load that crosses a page boundary can
+	# cause a >2x performance degradation here, so if $np does
+	# cross a page boundary, copy it to the stack and make sure the
+	# stack frame doesn't...
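+	# (in C terms the check above is: crosses = ((np & 4095) + 32*10) >> 12,
+	# nonzero iff the 320 bytes of vector data at np straddle a 4K page)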
+ sub \$32*10,%rsp
+ vmovdqu 32*0-128($np), $ACC0
+ and \$-2048, %rsp
+ vmovdqu 32*1-128($np), $ACC1
+ vmovdqu 32*2-128($np), $ACC2
+ vmovdqu 32*3-128($np), $ACC3
+ vmovdqu 32*4-128($np), $ACC4
+ vmovdqu 32*5-128($np), $ACC5
+ vmovdqu 32*6-128($np), $ACC6
+ vmovdqu 32*7-128($np), $ACC7
+ vmovdqu 32*8-128($np), $ACC8
+ lea $FrameSize+128(%rsp),$np
+ vmovdqu $ACC0, 32*0-128($np)
+ vmovdqu $ACC1, 32*1-128($np)
+ vmovdqu $ACC2, 32*2-128($np)
+ vmovdqu $ACC3, 32*3-128($np)
+ vmovdqu $ACC4, 32*4-128($np)
+ vmovdqu $ACC5, 32*5-128($np)
+ vmovdqu $ACC6, 32*6-128($np)
+ vmovdqu $ACC7, 32*7-128($np)
+ vmovdqu $ACC8, 32*8-128($np)
+ vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero
+
+.Lsqr_1024_no_n_copy:
+ and \$-1024, %rsp
+
+ vmovdqu 32*1-128($ap), $ACC1
+ vmovdqu 32*2-128($ap), $ACC2
+ vmovdqu 32*3-128($ap), $ACC3
+ vmovdqu 32*4-128($ap), $ACC4
+ vmovdqu 32*5-128($ap), $ACC5
+ vmovdqu 32*6-128($ap), $ACC6
+ vmovdqu 32*7-128($ap), $ACC7
+ vmovdqu 32*8-128($ap), $ACC8
+
+ lea 192(%rsp), $tp0 # 64+128=192
+ vpbroadcastq .Land_mask(%rip), $AND_MASK
+ jmp .LOOP_GRANDE_SQR_1024
+
+.align 32
+.LOOP_GRANDE_SQR_1024:
+ lea 32*18+128(%rsp), $aap # size optimization
+ lea 448(%rsp), $tp1 # 64+128+256=448
+
+ # the squaring is performed as described in Variant B of
+ # "Speeding up Big-Number Squaring", so start by calculating
+ # the A*2=A+A vector
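+	# (doubling means each cross product a[i]*a[j], i<j, is taken only
+	# once, against the doubled operand, while the diagonal squares
+	# a[i]*a[i] still use the original vector)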
+ vpaddq $ACC1, $ACC1, $ACC1
+ vpbroadcastq 32*0-128($ap), $B1
+ vpaddq $ACC2, $ACC2, $ACC2
+ vmovdqa $ACC1, 32*0-128($aap)
+ vpaddq $ACC3, $ACC3, $ACC3
+ vmovdqa $ACC2, 32*1-128($aap)
+ vpaddq $ACC4, $ACC4, $ACC4
+ vmovdqa $ACC3, 32*2-128($aap)
+ vpaddq $ACC5, $ACC5, $ACC5
+ vmovdqa $ACC4, 32*3-128($aap)
+ vpaddq $ACC6, $ACC6, $ACC6
+ vmovdqa $ACC5, 32*4-128($aap)
+ vpaddq $ACC7, $ACC7, $ACC7
+ vmovdqa $ACC6, 32*5-128($aap)
+ vpaddq $ACC8, $ACC8, $ACC8
+ vmovdqa $ACC7, 32*6-128($aap)
+ vpxor $ACC9, $ACC9, $ACC9
+ vmovdqa $ACC8, 32*7-128($aap)
+
+ vpmuludq 32*0-128($ap), $B1, $ACC0
+ vpbroadcastq 32*1-128($ap), $B2
+ vmovdqu $ACC9, 32*9-192($tp0) # zero upper half
+ vpmuludq $B1, $ACC1, $ACC1
+ vmovdqu $ACC9, 32*10-448($tp1)
+ vpmuludq $B1, $ACC2, $ACC2
+ vmovdqu $ACC9, 32*11-448($tp1)
+ vpmuludq $B1, $ACC3, $ACC3
+ vmovdqu $ACC9, 32*12-448($tp1)
+ vpmuludq $B1, $ACC4, $ACC4
+ vmovdqu $ACC9, 32*13-448($tp1)
+ vpmuludq $B1, $ACC5, $ACC5
+ vmovdqu $ACC9, 32*14-448($tp1)
+ vpmuludq $B1, $ACC6, $ACC6
+ vmovdqu $ACC9, 32*15-448($tp1)
+ vpmuludq $B1, $ACC7, $ACC7
+ vmovdqu $ACC9, 32*16-448($tp1)
+ vpmuludq $B1, $ACC8, $ACC8
+ vpbroadcastq 32*2-128($ap), $B1
+ vmovdqu $ACC9, 32*17-448($tp1)
+
+ mov $ap, $tpa
+ mov \$4, $i
+ jmp .Lsqr_entry_1024
+___
+$TEMP0=$Y1;
+$TEMP2=$Y2;
+$code.=<<___;
+.align 32
+.LOOP_SQR_1024:
+ vpbroadcastq 32*1-128($tpa), $B2
+ vpmuludq 32*0-128($ap), $B1, $ACC0
+ vpaddq 32*0-192($tp0), $ACC0, $ACC0
+ vpmuludq 32*0-128($aap), $B1, $ACC1
+ vpaddq 32*1-192($tp0), $ACC1, $ACC1
+ vpmuludq 32*1-128($aap), $B1, $ACC2
+ vpaddq 32*2-192($tp0), $ACC2, $ACC2
+ vpmuludq 32*2-128($aap), $B1, $ACC3
+ vpaddq 32*3-192($tp0), $ACC3, $ACC3
+ vpmuludq 32*3-128($aap), $B1, $ACC4
+ vpaddq 32*4-192($tp0), $ACC4, $ACC4
+ vpmuludq 32*4-128($aap), $B1, $ACC5
+ vpaddq 32*5-192($tp0), $ACC5, $ACC5
+ vpmuludq 32*5-128($aap), $B1, $ACC6
+ vpaddq 32*6-192($tp0), $ACC6, $ACC6
+ vpmuludq 32*6-128($aap), $B1, $ACC7
+ vpaddq 32*7-192($tp0), $ACC7, $ACC7
+ vpmuludq 32*7-128($aap), $B1, $ACC8
+ vpbroadcastq 32*2-128($tpa), $B1
+ vpaddq 32*8-192($tp0), $ACC8, $ACC8
+.Lsqr_entry_1024:
+ vmovdqu $ACC0, 32*0-192($tp0)
+ vmovdqu $ACC1, 32*1-192($tp0)
+
+ vpmuludq 32*1-128($ap), $B2, $TEMP0
+ vpaddq $TEMP0, $ACC2, $ACC2
+ vpmuludq 32*1-128($aap), $B2, $TEMP1
+ vpaddq $TEMP1, $ACC3, $ACC3
+ vpmuludq 32*2-128($aap), $B2, $TEMP2
+ vpaddq $TEMP2, $ACC4, $ACC4
+ vpmuludq 32*3-128($aap), $B2, $TEMP0
+ vpaddq $TEMP0, $ACC5, $ACC5
+ vpmuludq 32*4-128($aap), $B2, $TEMP1
+ vpaddq $TEMP1, $ACC6, $ACC6
+ vpmuludq 32*5-128($aap), $B2, $TEMP2
+ vpaddq $TEMP2, $ACC7, $ACC7
+ vpmuludq 32*6-128($aap), $B2, $TEMP0
+ vpaddq $TEMP0, $ACC8, $ACC8
+ vpmuludq 32*7-128($aap), $B2, $ACC0
+ vpbroadcastq 32*3-128($tpa), $B2
+ vpaddq 32*9-192($tp0), $ACC0, $ACC0
+
+ vmovdqu $ACC2, 32*2-192($tp0)
+ vmovdqu $ACC3, 32*3-192($tp0)
+
+ vpmuludq 32*2-128($ap), $B1, $TEMP2
+ vpaddq $TEMP2, $ACC4, $ACC4
+ vpmuludq 32*2-128($aap), $B1, $TEMP0
+ vpaddq $TEMP0, $ACC5, $ACC5
+ vpmuludq 32*3-128($aap), $B1, $TEMP1
+ vpaddq $TEMP1, $ACC6, $ACC6
+ vpmuludq 32*4-128($aap), $B1, $TEMP2
+ vpaddq $TEMP2, $ACC7, $ACC7
+ vpmuludq 32*5-128($aap), $B1, $TEMP0
+ vpaddq $TEMP0, $ACC8, $ACC8
+ vpmuludq 32*6-128($aap), $B1, $TEMP1
+ vpaddq $TEMP1, $ACC0, $ACC0
+ vpmuludq 32*7-128($aap), $B1, $ACC1
+ vpbroadcastq 32*4-128($tpa), $B1
+ vpaddq 32*10-448($tp1), $ACC1, $ACC1
+
+ vmovdqu $ACC4, 32*4-192($tp0)
+ vmovdqu $ACC5, 32*5-192($tp0)
+
+ vpmuludq 32*3-128($ap), $B2, $TEMP0
+ vpaddq $TEMP0, $ACC6, $ACC6
+ vpmuludq 32*3-128($aap), $B2, $TEMP1
+ vpaddq $TEMP1, $ACC7, $ACC7
+ vpmuludq 32*4-128($aap), $B2, $TEMP2
+ vpaddq $TEMP2, $ACC8, $ACC8
+ vpmuludq 32*5-128($aap), $B2, $TEMP0
+ vpaddq $TEMP0, $ACC0, $ACC0
+ vpmuludq 32*6-128($aap), $B2, $TEMP1
+ vpaddq $TEMP1, $ACC1, $ACC1
+ vpmuludq 32*7-128($aap), $B2, $ACC2
+ vpbroadcastq 32*5-128($tpa), $B2
+ vpaddq 32*11-448($tp1), $ACC2, $ACC2
+
+ vmovdqu $ACC6, 32*6-192($tp0)
+ vmovdqu $ACC7, 32*7-192($tp0)
+
+ vpmuludq 32*4-128($ap), $B1, $TEMP0
+ vpaddq $TEMP0, $ACC8, $ACC8
+ vpmuludq 32*4-128($aap), $B1, $TEMP1
+ vpaddq $TEMP1, $ACC0, $ACC0
+ vpmuludq 32*5-128($aap), $B1, $TEMP2
+ vpaddq $TEMP2, $ACC1, $ACC1
+ vpmuludq 32*6-128($aap), $B1, $TEMP0
+ vpaddq $TEMP0, $ACC2, $ACC2
+ vpmuludq 32*7-128($aap), $B1, $ACC3
+ vpbroadcastq 32*6-128($tpa), $B1
+ vpaddq 32*12-448($tp1), $ACC3, $ACC3
+
+ vmovdqu $ACC8, 32*8-192($tp0)
+ vmovdqu $ACC0, 32*9-192($tp0)
+ lea 8($tp0), $tp0
+
+ vpmuludq 32*5-128($ap), $B2, $TEMP2
+ vpaddq $TEMP2, $ACC1, $ACC1
+ vpmuludq 32*5-128($aap), $B2, $TEMP0
+ vpaddq $TEMP0, $ACC2, $ACC2
+ vpmuludq 32*6-128($aap), $B2, $TEMP1
+ vpaddq $TEMP1, $ACC3, $ACC3
+ vpmuludq 32*7-128($aap), $B2, $ACC4
+ vpbroadcastq 32*7-128($tpa), $B2
+ vpaddq 32*13-448($tp1), $ACC4, $ACC4
+
+ vmovdqu $ACC1, 32*10-448($tp1)
+ vmovdqu $ACC2, 32*11-448($tp1)
+
+ vpmuludq 32*6-128($ap), $B1, $TEMP0
+ vpaddq $TEMP0, $ACC3, $ACC3
+ vpmuludq 32*6-128($aap), $B1, $TEMP1
+ vpbroadcastq 32*8-128($tpa), $ACC0 # borrow $ACC0 for $B1
+ vpaddq $TEMP1, $ACC4, $ACC4
+ vpmuludq 32*7-128($aap), $B1, $ACC5
+ vpbroadcastq 32*0+8-128($tpa), $B1 # for next iteration
+ vpaddq 32*14-448($tp1), $ACC5, $ACC5
+
+ vmovdqu $ACC3, 32*12-448($tp1)
+ vmovdqu $ACC4, 32*13-448($tp1)
+ lea 8($tpa), $tpa
+
+ vpmuludq 32*7-128($ap), $B2, $TEMP0
+ vpaddq $TEMP0, $ACC5, $ACC5
+ vpmuludq 32*7-128($aap), $B2, $ACC6
+ vpaddq 32*15-448($tp1), $ACC6, $ACC6
+
+ vpmuludq 32*8-128($ap), $ACC0, $ACC7
+ vmovdqu $ACC5, 32*14-448($tp1)
+ vpaddq 32*16-448($tp1), $ACC7, $ACC7
+ vmovdqu $ACC6, 32*15-448($tp1)
+ vmovdqu $ACC7, 32*16-448($tp1)
+ lea 8($tp1), $tp1
+
+ dec $i
+ jnz .LOOP_SQR_1024
+___
+$ZERO = $ACC9;
+$TEMP0 = $B1;
+$TEMP2 = $B2;
+$TEMP3 = $Y1;
+$TEMP4 = $Y2;
+$code.=<<___;
+	# we need to fix indexes 32-39 to avoid overflow
+ vmovdqu 32*8(%rsp), $ACC8 # 32*8-192($tp0),
+ vmovdqu 32*9(%rsp), $ACC1 # 32*9-192($tp0)
+ vmovdqu 32*10(%rsp), $ACC2 # 32*10-192($tp0)
+ lea 192(%rsp), $tp0 # 64+128=192
+
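+	# partial carry propagation: keep the low 29 bits of each digit
+	# (vpand), shift the excess down (vpsrlq), rotate the carries one
+	# lane up (vpermq \$0x93) and blend/add them into the next digits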
+ vpsrlq \$29, $ACC8, $TEMP1
+ vpand $AND_MASK, $ACC8, $ACC8
+ vpsrlq \$29, $ACC1, $TEMP2
+ vpand $AND_MASK, $ACC1, $ACC1
+
+ vpermq \$0x93, $TEMP1, $TEMP1
+ vpxor $ZERO, $ZERO, $ZERO
+ vpermq \$0x93, $TEMP2, $TEMP2
+
+ vpblendd \$3, $ZERO, $TEMP1, $TEMP0
+ vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
+ vpaddq $TEMP0, $ACC8, $ACC8
+ vpblendd \$3, $TEMP2, $ZERO, $TEMP2
+ vpaddq $TEMP1, $ACC1, $ACC1
+ vpaddq $TEMP2, $ACC2, $ACC2
+ vmovdqu $ACC1, 32*9-192($tp0)
+ vmovdqu $ACC2, 32*10-192($tp0)
+
+ mov (%rsp), %rax
+ mov 8(%rsp), $r1
+ mov 16(%rsp), $r2
+ mov 24(%rsp), $r3
+ vmovdqu 32*1(%rsp), $ACC1
+ vmovdqu 32*2-192($tp0), $ACC2
+ vmovdqu 32*3-192($tp0), $ACC3
+ vmovdqu 32*4-192($tp0), $ACC4
+ vmovdqu 32*5-192($tp0), $ACC5
+ vmovdqu 32*6-192($tp0), $ACC6
+ vmovdqu 32*7-192($tp0), $ACC7
+
+ mov %rax, $r0
+ imull $n0, %eax
+ and \$0x1fffffff, %eax
+ vmovd %eax, $Y1
+
+ mov %rax, %rdx
+ imulq -128($np), %rax
+ vpbroadcastq $Y1, $Y1
+ add %rax, $r0
+ mov %rdx, %rax
+ imulq 8-128($np), %rax
+ shr \$29, $r0
+ add %rax, $r1
+ mov %rdx, %rax
+ imulq 16-128($np), %rax
+ add $r0, $r1
+ add %rax, $r2
+ imulq 24-128($np), %rdx
+ add %rdx, $r3
+
+ mov $r1, %rax
+ imull $n0, %eax
+ and \$0x1fffffff, %eax
+
+ mov \$9, $i
+ jmp .LOOP_REDUCE_1024
+
+.align 32
+.LOOP_REDUCE_1024:
+ vmovd %eax, $Y2
+ vpbroadcastq $Y2, $Y2
+
+ vpmuludq 32*1-128($np), $Y1, $TEMP0
+ mov %rax, %rdx
+ imulq -128($np), %rax
+ vpaddq $TEMP0, $ACC1, $ACC1
+ add %rax, $r1
+ vpmuludq 32*2-128($np), $Y1, $TEMP1
+ mov %rdx, %rax
+ imulq 8-128($np), %rax
+ vpaddq $TEMP1, $ACC2, $ACC2
+ vpmuludq 32*3-128($np), $Y1, $TEMP2
+ .byte 0x67
+ add %rax, $r2
+ .byte 0x67
+ mov %rdx, %rax
+ imulq 16-128($np), %rax
+ shr \$29, $r1
+ vpaddq $TEMP2, $ACC3, $ACC3
+ vpmuludq 32*4-128($np), $Y1, $TEMP0
+ add %rax, $r3
+ add $r1, $r2
+ vpaddq $TEMP0, $ACC4, $ACC4
+ vpmuludq 32*5-128($np), $Y1, $TEMP1
+ mov $r2, %rax
+ imull $n0, %eax
+ vpaddq $TEMP1, $ACC5, $ACC5
+ vpmuludq 32*6-128($np), $Y1, $TEMP2
+ and \$0x1fffffff, %eax
+ vpaddq $TEMP2, $ACC6, $ACC6
+ vpmuludq 32*7-128($np), $Y1, $TEMP0
+ vpaddq $TEMP0, $ACC7, $ACC7
+ vpmuludq 32*8-128($np), $Y1, $TEMP1
+ vmovd %eax, $Y1
+ #vmovdqu 32*1-8-128($np), $TEMP2 # moved below
+ vpaddq $TEMP1, $ACC8, $ACC8
+ #vmovdqu 32*2-8-128($np), $TEMP0 # moved below
+ vpbroadcastq $Y1, $Y1
+
+ vpmuludq 32*1-8-128($np), $Y2, $TEMP2 # see above
+ vmovdqu 32*3-8-128($np), $TEMP1
+ mov %rax, %rdx
+ imulq -128($np), %rax
+ vpaddq $TEMP2, $ACC1, $ACC1
+ vpmuludq 32*2-8-128($np), $Y2, $TEMP0 # see above
+ vmovdqu 32*4-8-128($np), $TEMP2
+ add %rax, $r2
+ mov %rdx, %rax
+ imulq 8-128($np), %rax
+ vpaddq $TEMP0, $ACC2, $ACC2
+ add $r3, %rax
+ shr \$29, $r2
+ vpmuludq $Y2, $TEMP1, $TEMP1
+ vmovdqu 32*5-8-128($np), $TEMP0
+ add $r2, %rax
+ vpaddq $TEMP1, $ACC3, $ACC3
+ vpmuludq $Y2, $TEMP2, $TEMP2
+ vmovdqu 32*6-8-128($np), $TEMP1
+ .byte 0x67
+ mov %rax, $r3
+ imull $n0, %eax
+ vpaddq $TEMP2, $ACC4, $ACC4
+ vpmuludq $Y2, $TEMP0, $TEMP0
+ .byte 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00 # vmovdqu 32*7-8-128($np), $TEMP2
+ and \$0x1fffffff, %eax
+ vpaddq $TEMP0, $ACC5, $ACC5
+ vpmuludq $Y2, $TEMP1, $TEMP1
+ vmovdqu 32*8-8-128($np), $TEMP0
+ vpaddq $TEMP1, $ACC6, $ACC6
+ vpmuludq $Y2, $TEMP2, $TEMP2
+ vmovdqu 32*9-8-128($np), $ACC9
+ vmovd %eax, $ACC0 # borrow ACC0 for Y2
+ imulq -128($np), %rax
+ vpaddq $TEMP2, $ACC7, $ACC7
+ vpmuludq $Y2, $TEMP0, $TEMP0
+ vmovdqu 32*1-16-128($np), $TEMP1
+ vpbroadcastq $ACC0, $ACC0
+ vpaddq $TEMP0, $ACC8, $ACC8
+ vpmuludq $Y2, $ACC9, $ACC9
+ vmovdqu 32*2-16-128($np), $TEMP2
+ add %rax, $r3
+
+___
+($ACC0,$Y2)=($Y2,$ACC0);
+$code.=<<___;
+ vmovdqu 32*1-24-128($np), $ACC0
+ vpmuludq $Y1, $TEMP1, $TEMP1
+ vmovdqu 32*3-16-128($np), $TEMP0
+ vpaddq $TEMP1, $ACC1, $ACC1
+ vpmuludq $Y2, $ACC0, $ACC0
+ vpmuludq $Y1, $TEMP2, $TEMP2
+ .byte 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff # vmovdqu 32*4-16-128($np), $TEMP1
+ vpaddq $ACC1, $ACC0, $ACC0
+ vpaddq $TEMP2, $ACC2, $ACC2
+ vpmuludq $Y1, $TEMP0, $TEMP0
+ vmovdqu 32*5-16-128($np), $TEMP2
+ .byte 0x67
+ vmovq $ACC0, %rax
+ vmovdqu $ACC0, (%rsp) # transfer $r0-$r3
+ vpaddq $TEMP0, $ACC3, $ACC3
+ vpmuludq $Y1, $TEMP1, $TEMP1
+ vmovdqu 32*6-16-128($np), $TEMP0
+ vpaddq $TEMP1, $ACC4, $ACC4
+ vpmuludq $Y1, $TEMP2, $TEMP2
+ vmovdqu 32*7-16-128($np), $TEMP1
+ vpaddq $TEMP2, $ACC5, $ACC5
+ vpmuludq $Y1, $TEMP0, $TEMP0
+ vmovdqu 32*8-16-128($np), $TEMP2
+ vpaddq $TEMP0, $ACC6, $ACC6
+ vpmuludq $Y1, $TEMP1, $TEMP1
+ shr \$29, $r3
+ vmovdqu 32*9-16-128($np), $TEMP0
+ add $r3, %rax
+ vpaddq $TEMP1, $ACC7, $ACC7
+ vpmuludq $Y1, $TEMP2, $TEMP2
+ #vmovdqu 32*2-24-128($np), $TEMP1 # moved below
+ mov %rax, $r0
+ imull $n0, %eax
+ vpaddq $TEMP2, $ACC8, $ACC8
+ vpmuludq $Y1, $TEMP0, $TEMP0
+ and \$0x1fffffff, %eax
+ vmovd %eax, $Y1
+ vmovdqu 32*3-24-128($np), $TEMP2
+ .byte 0x67
+ vpaddq $TEMP0, $ACC9, $ACC9
+ vpbroadcastq $Y1, $Y1
+
+ vpmuludq 32*2-24-128($np), $Y2, $TEMP1 # see above
+ vmovdqu 32*4-24-128($np), $TEMP0
+ mov %rax, %rdx
+ imulq -128($np), %rax
+ mov 8(%rsp), $r1
+ vpaddq $TEMP1, $ACC2, $ACC1
+ vpmuludq $Y2, $TEMP2, $TEMP2
+ vmovdqu 32*5-24-128($np), $TEMP1
+ add %rax, $r0
+ mov %rdx, %rax
+ imulq 8-128($np), %rax
+ .byte 0x67
+ shr \$29, $r0
+ mov 16(%rsp), $r2
+ vpaddq $TEMP2, $ACC3, $ACC2
+ vpmuludq $Y2, $TEMP0, $TEMP0
+ vmovdqu 32*6-24-128($np), $TEMP2
+ add %rax, $r1
+ mov %rdx, %rax
+ imulq 16-128($np), %rax
+ vpaddq $TEMP0, $ACC4, $ACC3
+ vpmuludq $Y2, $TEMP1, $TEMP1
+ vmovdqu 32*7-24-128($np), $TEMP0
+ imulq 24-128($np), %rdx # future $r3
+ add %rax, $r2
+ lea ($r0,$r1), %rax
+ vpaddq $TEMP1, $ACC5, $ACC4
+ vpmuludq $Y2, $TEMP2, $TEMP2
+ vmovdqu 32*8-24-128($np), $TEMP1
+ mov %rax, $r1
+ imull $n0, %eax
+ vpmuludq $Y2, $TEMP0, $TEMP0
+ vpaddq $TEMP2, $ACC6, $ACC5
+ vmovdqu 32*9-24-128($np), $TEMP2
+ and \$0x1fffffff, %eax
+ vpaddq $TEMP0, $ACC7, $ACC6
+ vpmuludq $Y2, $TEMP1, $TEMP1
+ add 24(%rsp), %rdx
+ vpaddq $TEMP1, $ACC8, $ACC7
+ vpmuludq $Y2, $TEMP2, $TEMP2
+ vpaddq $TEMP2, $ACC9, $ACC8
+ vmovq $r3, $ACC9
+ mov %rdx, $r3
+
+ dec $i
+ jnz .LOOP_REDUCE_1024
+___
+($ACC0,$Y2)=($Y2,$ACC0);
+$code.=<<___;
+ lea 448(%rsp), $tp1 # size optimization
+ vpaddq $ACC9, $Y2, $ACC0
+ vpxor $ZERO, $ZERO, $ZERO
+
+ vpaddq 32*9-192($tp0), $ACC0, $ACC0
+ vpaddq 32*10-448($tp1), $ACC1, $ACC1
+ vpaddq 32*11-448($tp1), $ACC2, $ACC2
+ vpaddq 32*12-448($tp1), $ACC3, $ACC3
+ vpaddq 32*13-448($tp1), $ACC4, $ACC4
+ vpaddq 32*14-448($tp1), $ACC5, $ACC5
+ vpaddq 32*15-448($tp1), $ACC6, $ACC6
+ vpaddq 32*16-448($tp1), $ACC7, $ACC7
+ vpaddq 32*17-448($tp1), $ACC8, $ACC8
+
+ vpsrlq \$29, $ACC0, $TEMP1
+ vpand $AND_MASK, $ACC0, $ACC0
+ vpsrlq \$29, $ACC1, $TEMP2
+ vpand $AND_MASK, $ACC1, $ACC1
+ vpsrlq \$29, $ACC2, $TEMP3
+ vpermq \$0x93, $TEMP1, $TEMP1
+ vpand $AND_MASK, $ACC2, $ACC2
+ vpsrlq \$29, $ACC3, $TEMP4
+ vpermq \$0x93, $TEMP2, $TEMP2
+ vpand $AND_MASK, $ACC3, $ACC3
+ vpermq \$0x93, $TEMP3, $TEMP3
+
+ vpblendd \$3, $ZERO, $TEMP1, $TEMP0
+ vpermq \$0x93, $TEMP4, $TEMP4
+ vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
+ vpaddq $TEMP0, $ACC0, $ACC0
+ vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
+ vpaddq $TEMP1, $ACC1, $ACC1
+ vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
+ vpaddq $TEMP2, $ACC2, $ACC2
+ vpblendd \$3, $TEMP4, $ZERO, $TEMP4
+ vpaddq $TEMP3, $ACC3, $ACC3
+ vpaddq $TEMP4, $ACC4, $ACC4
+
+ vpsrlq \$29, $ACC0, $TEMP1
+ vpand $AND_MASK, $ACC0, $ACC0
+ vpsrlq \$29, $ACC1, $TEMP2
+ vpand $AND_MASK, $ACC1, $ACC1
+ vpsrlq \$29, $ACC2, $TEMP3
+ vpermq \$0x93, $TEMP1, $TEMP1
+ vpand $AND_MASK, $ACC2, $ACC2
+ vpsrlq \$29, $ACC3, $TEMP4
+ vpermq \$0x93, $TEMP2, $TEMP2
+ vpand $AND_MASK, $ACC3, $ACC3
+ vpermq \$0x93, $TEMP3, $TEMP3
+
+ vpblendd \$3, $ZERO, $TEMP1, $TEMP0
+ vpermq \$0x93, $TEMP4, $TEMP4
+ vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
+ vpaddq $TEMP0, $ACC0, $ACC0
+ vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
+ vpaddq $TEMP1, $ACC1, $ACC1
+ vmovdqu $ACC0, 32*0-128($rp)
+ vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
+ vpaddq $TEMP2, $ACC2, $ACC2
+ vmovdqu $ACC1, 32*1-128($rp)
+ vpblendd \$3, $TEMP4, $ZERO, $TEMP4
+ vpaddq $TEMP3, $ACC3, $ACC3
+ vmovdqu $ACC2, 32*2-128($rp)
+ vpaddq $TEMP4, $ACC4, $ACC4
+ vmovdqu $ACC3, 32*3-128($rp)
+___
+$TEMP5=$ACC0;
+$code.=<<___;
+ vpsrlq \$29, $ACC4, $TEMP1
+ vpand $AND_MASK, $ACC4, $ACC4
+ vpsrlq \$29, $ACC5, $TEMP2
+ vpand $AND_MASK, $ACC5, $ACC5
+ vpsrlq \$29, $ACC6, $TEMP3
+ vpermq \$0x93, $TEMP1, $TEMP1
+ vpand $AND_MASK, $ACC6, $ACC6
+ vpsrlq \$29, $ACC7, $TEMP4
+ vpermq \$0x93, $TEMP2, $TEMP2
+ vpand $AND_MASK, $ACC7, $ACC7
+ vpsrlq \$29, $ACC8, $TEMP5
+ vpermq \$0x93, $TEMP3, $TEMP3
+ vpand $AND_MASK, $ACC8, $ACC8
+ vpermq \$0x93, $TEMP4, $TEMP4
+
+ vpblendd \$3, $ZERO, $TEMP1, $TEMP0
+ vpermq \$0x93, $TEMP5, $TEMP5
+ vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
+ vpaddq $TEMP0, $ACC4, $ACC4
+ vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
+ vpaddq $TEMP1, $ACC5, $ACC5
+ vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
+ vpaddq $TEMP2, $ACC6, $ACC6
+ vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
+ vpaddq $TEMP3, $ACC7, $ACC7
+ vpaddq $TEMP4, $ACC8, $ACC8
+
+ vpsrlq \$29, $ACC4, $TEMP1
+ vpand $AND_MASK, $ACC4, $ACC4
+ vpsrlq \$29, $ACC5, $TEMP2
+ vpand $AND_MASK, $ACC5, $ACC5
+ vpsrlq \$29, $ACC6, $TEMP3
+ vpermq \$0x93, $TEMP1, $TEMP1
+ vpand $AND_MASK, $ACC6, $ACC6
+ vpsrlq \$29, $ACC7, $TEMP4
+ vpermq \$0x93, $TEMP2, $TEMP2
+ vpand $AND_MASK, $ACC7, $ACC7
+ vpsrlq \$29, $ACC8, $TEMP5
+ vpermq \$0x93, $TEMP3, $TEMP3
+ vpand $AND_MASK, $ACC8, $ACC8
+ vpermq \$0x93, $TEMP4, $TEMP4
+
+ vpblendd \$3, $ZERO, $TEMP1, $TEMP0
+ vpermq \$0x93, $TEMP5, $TEMP5
+ vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
+ vpaddq $TEMP0, $ACC4, $ACC4
+ vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
+ vpaddq $TEMP1, $ACC5, $ACC5
+ vmovdqu $ACC4, 32*4-128($rp)
+ vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
+ vpaddq $TEMP2, $ACC6, $ACC6
+ vmovdqu $ACC5, 32*5-128($rp)
+ vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
+ vpaddq $TEMP3, $ACC7, $ACC7
+ vmovdqu $ACC6, 32*6-128($rp)
+ vpaddq $TEMP4, $ACC8, $ACC8
+ vmovdqu $ACC7, 32*7-128($rp)
+ vmovdqu $ACC8, 32*8-128($rp)
+
+ mov $rp, $ap
+ dec $rep
+ jne .LOOP_GRANDE_SQR_1024
+
+ vzeroall
+ mov %rbp, %rax
+___
+$code.=<<___ if ($win64);
+ movaps -0xd8(%rax),%xmm6
+ movaps -0xc8(%rax),%xmm7
+ movaps -0xb8(%rax),%xmm8
+ movaps -0xa8(%rax),%xmm9
+ movaps -0x98(%rax),%xmm10
+ movaps -0x88(%rax),%xmm11
+ movaps -0x78(%rax),%xmm12
+ movaps -0x68(%rax),%xmm13
+ movaps -0x58(%rax),%xmm14
+ movaps -0x48(%rax),%xmm15
+___
+$code.=<<___;
+ mov -48(%rax),%r15
+ mov -40(%rax),%r14
+ mov -32(%rax),%r13
+ mov -24(%rax),%r12
+ mov -16(%rax),%rbp
+ mov -8(%rax),%rbx
+ lea (%rax),%rsp # restore %rsp
+.Lsqr_1024_epilogue:
+ ret
+.size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
+___
+}
+
+{ # void AMM_WW(
+my $rp="%rdi"; # BN_ULONG *rp,
+my $ap="%rsi"; # const BN_ULONG *ap,
+my $bp="%rdx"; # const BN_ULONG *bp,
+my $np="%rcx"; # const BN_ULONG *np,
+my $n0="%r8d"; # unsigned int n0);
+
+# The registers that hold the accumulated redundant result.
+# The AMM works on 1024-bit operands with a redundant word size of 29
+# bits, i.e. ceil(1024/29) = 36 digits, held four per register: 36/4 = 9
+my $ACC0="%ymm0";
+my $ACC1="%ymm1";
+my $ACC2="%ymm2";
+my $ACC3="%ymm3";
+my $ACC4="%ymm4";
+my $ACC5="%ymm5";
+my $ACC6="%ymm6";
+my $ACC7="%ymm7";
+my $ACC8="%ymm8";
+my $ACC9="%ymm9";
+
+# Registers that hold the broadcasted words of multiplier, currently used
+my $Bi="%ymm10";
+my $Yi="%ymm11";
+
+# Helper registers
+my $TEMP0=$ACC0;
+my $TEMP1="%ymm12";
+my $TEMP2="%ymm13";
+my $ZERO="%ymm14";
+my $AND_MASK="%ymm15";
+
+# ALU registers that hold the first words of the ACC
+my $r0="%r9";
+my $r1="%r10";
+my $r2="%r11";
+my $r3="%r12";
+
+my $i="%r14d";
+my $tmp="%r15";
+
+$bp="%r13"; # reassigned argument
+
+$code.=<<___;
+.globl rsaz_1024_mul_avx2
+.type rsaz_1024_mul_avx2,\@function,5
+.align 64
+rsaz_1024_mul_avx2:
+ lea (%rsp), %rax
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+___
+$code.=<<___ if ($win64);
+ vzeroupper
+ lea -0xa8(%rsp),%rsp
+ vmovaps %xmm6,-0xd8(%rax)
+ vmovaps %xmm7,-0xc8(%rax)
+ vmovaps %xmm8,-0xb8(%rax)
+ vmovaps %xmm9,-0xa8(%rax)
+ vmovaps %xmm10,-0x98(%rax)
+ vmovaps %xmm11,-0x88(%rax)
+ vmovaps %xmm12,-0x78(%rax)
+ vmovaps %xmm13,-0x68(%rax)
+ vmovaps %xmm14,-0x58(%rax)
+ vmovaps %xmm15,-0x48(%rax)
+.Lmul_1024_body:
+___
+$code.=<<___;
+ mov %rax,%rbp
+ vzeroall
+ mov %rdx, $bp # reassigned argument
+ sub \$64,%rsp
+
+	# an unaligned 256-bit load that crosses a page boundary can
+	# cause severe performance degradation here, so if $ap does
+	# cross a page boundary, swap it with $bp [meaning that the
+	# caller is advised to lay $ap and $bp down next to each other,
+	# so that only one of them can cross a page boundary].
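+	# (the page-cross test below is the same as in the squaring
+	# routine; cmovnz then swaps the two pointers branch-free,
+	# keying off the flags left behind by shr)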
+ .byte 0x67,0x67
+ mov $ap, $tmp
+ and \$4095, $tmp
+ add \$32*10, $tmp
+ shr \$12, $tmp
+ mov $ap, $tmp
+ cmovnz $bp, $ap
+ cmovnz $tmp, $bp
+
+ mov $np, $tmp
+ sub \$-128,$ap # size optimization
+ sub \$-128,$np
+ sub \$-128,$rp
+
+ and \$4095, $tmp # see if $np crosses page
+ add \$32*10, $tmp
+ .byte 0x67,0x67
+ shr \$12, $tmp
+ jz .Lmul_1024_no_n_copy
+
+	# an unaligned 256-bit load that crosses a page boundary can
+	# cause severe performance degradation here, so if $np does
+	# cross a page boundary, copy it to the stack and make sure the
+	# stack frame doesn't...
+ sub \$32*10,%rsp
+ vmovdqu 32*0-128($np), $ACC0
+ and \$-512, %rsp
+ vmovdqu 32*1-128($np), $ACC1
+ vmovdqu 32*2-128($np), $ACC2
+ vmovdqu 32*3-128($np), $ACC3
+ vmovdqu 32*4-128($np), $ACC4
+ vmovdqu 32*5-128($np), $ACC5
+ vmovdqu 32*6-128($np), $ACC6
+ vmovdqu 32*7-128($np), $ACC7
+ vmovdqu 32*8-128($np), $ACC8
+ lea 64+128(%rsp),$np
+ vmovdqu $ACC0, 32*0-128($np)
+ vpxor $ACC0, $ACC0, $ACC0
+ vmovdqu $ACC1, 32*1-128($np)
+ vpxor $ACC1, $ACC1, $ACC1
+ vmovdqu $ACC2, 32*2-128($np)
+ vpxor $ACC2, $ACC2, $ACC2
+ vmovdqu $ACC3, 32*3-128($np)
+ vpxor $ACC3, $ACC3, $ACC3
+ vmovdqu $ACC4, 32*4-128($np)
+ vpxor $ACC4, $ACC4, $ACC4
+ vmovdqu $ACC5, 32*5-128($np)
+ vpxor $ACC5, $ACC5, $ACC5
+ vmovdqu $ACC6, 32*6-128($np)
+ vpxor $ACC6, $ACC6, $ACC6
+ vmovdqu $ACC7, 32*7-128($np)
+ vpxor $ACC7, $ACC7, $ACC7
+ vmovdqu $ACC8, 32*8-128($np)
+ vmovdqa $ACC0, $ACC8
+ vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero after vzeroall
+.Lmul_1024_no_n_copy:
+ and \$-64,%rsp
+
+ mov ($bp), %rbx
+ vpbroadcastq ($bp), $Bi
+ vmovdqu $ACC0, (%rsp) # clear top of stack
+ xor $r0, $r0
+ .byte 0x67
+ xor $r1, $r1
+ xor $r2, $r2
+ xor $r3, $r3
+
+ vmovdqu .Land_mask(%rip), $AND_MASK
+ mov \$9, $i
+ vmovdqu $ACC9, 32*9-128($rp) # $ACC9 is zero after vzeroall
+ jmp .Loop_mul_1024
+
+.align 32
+.Loop_mul_1024:
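+	# each of the 9 passes consumes four multiplier words from ($bp):
+	# scalar imulq chains maintain the lowest result words in $r0-$r3
+	# while the vector unit accumulates the remaining digits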
+ vpsrlq \$29, $ACC3, $ACC9 # correct $ACC3(*)
+ mov %rbx, %rax
+ imulq -128($ap), %rax
+ add $r0, %rax
+ mov %rbx, $r1
+ imulq 8-128($ap), $r1
+ add 8(%rsp), $r1
+
+ mov %rax, $r0
+ imull $n0, %eax
+ and \$0x1fffffff, %eax
+
+ mov %rbx, $r2
+ imulq 16-128($ap), $r2
+ add 16(%rsp), $r2
+
+ mov %rbx, $r3
+ imulq 24-128($ap), $r3
+ add 24(%rsp), $r3
+ vpmuludq 32*1-128($ap),$Bi,$TEMP0
+ vmovd %eax, $Yi
+ vpaddq $TEMP0,$ACC1,$ACC1
+ vpmuludq 32*2-128($ap),$Bi,$TEMP1
+ vpbroadcastq $Yi, $Yi
+ vpaddq $TEMP1,$ACC2,$ACC2
+ vpmuludq 32*3-128($ap),$Bi,$TEMP2
+ vpand $AND_MASK, $ACC3, $ACC3 # correct $ACC3
+ vpaddq $TEMP2,$ACC3,$ACC3
+ vpmuludq 32*4-128($ap),$Bi,$TEMP0
+ vpaddq $TEMP0,$ACC4,$ACC4
+ vpmuludq 32*5-128($ap),$Bi,$TEMP1
+ vpaddq $TEMP1,$ACC5,$ACC5
+ vpmuludq 32*6-128($ap),$Bi,$TEMP2
+ vpaddq $TEMP2,$ACC6,$ACC6
+ vpmuludq 32*7-128($ap),$Bi,$TEMP0
+ vpermq \$0x93, $ACC9, $ACC9 # correct $ACC3
+ vpaddq $TEMP0,$ACC7,$ACC7
+ vpmuludq 32*8-128($ap),$Bi,$TEMP1
+ vpbroadcastq 8($bp), $Bi
+ vpaddq $TEMP1,$ACC8,$ACC8
+
+ mov %rax,%rdx
+ imulq -128($np),%rax
+ add %rax,$r0
+ mov %rdx,%rax
+ imulq 8-128($np),%rax
+ add %rax,$r1
+ mov %rdx,%rax
+ imulq 16-128($np),%rax
+ add %rax,$r2
+ shr \$29, $r0
+ imulq 24-128($np),%rdx
+ add %rdx,$r3
+ add $r0, $r1
+
+ vpmuludq 32*1-128($np),$Yi,$TEMP2
+ vmovq $Bi, %rbx
+ vpaddq $TEMP2,$ACC1,$ACC1
+ vpmuludq 32*2-128($np),$Yi,$TEMP0
+ vpaddq $TEMP0,$ACC2,$ACC2
+ vpmuludq 32*3-128($np),$Yi,$TEMP1
+ vpaddq $TEMP1,$ACC3,$ACC3
+ vpmuludq 32*4-128($np),$Yi,$TEMP2
+ vpaddq $TEMP2,$ACC4,$ACC4
+ vpmuludq 32*5-128($np),$Yi,$TEMP0
+ vpaddq $TEMP0,$ACC5,$ACC5
+ vpmuludq 32*6-128($np),$Yi,$TEMP1
+ vpaddq $TEMP1,$ACC6,$ACC6
+ vpmuludq 32*7-128($np),$Yi,$TEMP2
+ vpblendd \$3, $ZERO, $ACC9, $ACC9 # correct $ACC3
+ vpaddq $TEMP2,$ACC7,$ACC7
+ vpmuludq 32*8-128($np),$Yi,$TEMP0
+ vpaddq $ACC9, $ACC3, $ACC3 # correct $ACC3
+ vpaddq $TEMP0,$ACC8,$ACC8
+
+ mov %rbx, %rax
+ imulq -128($ap),%rax
+ add %rax,$r1
+ vmovdqu -8+32*1-128($ap),$TEMP1
+ mov %rbx, %rax
+ imulq 8-128($ap),%rax
+ add %rax,$r2
+ vmovdqu -8+32*2-128($ap),$TEMP2
+
+ mov $r1, %rax
+ imull $n0, %eax
+ and \$0x1fffffff, %eax
+
+ imulq 16-128($ap),%rbx
+ add %rbx,$r3
+ vpmuludq $Bi,$TEMP1,$TEMP1
+ vmovd %eax, $Yi
+ vmovdqu -8+32*3-128($ap),$TEMP0
+ vpaddq $TEMP1,$ACC1,$ACC1
+ vpmuludq $Bi,$TEMP2,$TEMP2
+ vpbroadcastq $Yi, $Yi
+ vmovdqu -8+32*4-128($ap),$TEMP1
+ vpaddq $TEMP2,$ACC2,$ACC2
+ vpmuludq $Bi,$TEMP0,$TEMP0
+ vmovdqu -8+32*5-128($ap),$TEMP2
+ vpaddq $TEMP0,$ACC3,$ACC3
+ vpmuludq $Bi,$TEMP1,$TEMP1
+ vmovdqu -8+32*6-128($ap),$TEMP0
+ vpaddq $TEMP1,$ACC4,$ACC4
+ vpmuludq $Bi,$TEMP2,$TEMP2
+ vmovdqu -8+32*7-128($ap),$TEMP1
+ vpaddq $TEMP2,$ACC5,$ACC5
+ vpmuludq $Bi,$TEMP0,$TEMP0
+ vmovdqu -8+32*8-128($ap),$TEMP2
+ vpaddq $TEMP0,$ACC6,$ACC6
+ vpmuludq $Bi,$TEMP1,$TEMP1
+ vmovdqu -8+32*9-128($ap),$ACC9
+ vpaddq $TEMP1,$ACC7,$ACC7
+ vpmuludq $Bi,$TEMP2,$TEMP2
+ vpaddq $TEMP2,$ACC8,$ACC8
+ vpmuludq $Bi,$ACC9,$ACC9
+ vpbroadcastq 16($bp), $Bi
+
+ mov %rax,%rdx
+ imulq -128($np),%rax
+ add %rax,$r1
+ vmovdqu -8+32*1-128($np),$TEMP0
+ mov %rdx,%rax
+ imulq 8-128($np),%rax
+ add %rax,$r2
+ vmovdqu -8+32*2-128($np),$TEMP1
+ shr \$29, $r1
+ imulq 16-128($np),%rdx
+ add %rdx,$r3
+ add $r1, $r2
+
+ vpmuludq $Yi,$TEMP0,$TEMP0
+ vmovq $Bi, %rbx
+ vmovdqu -8+32*3-128($np),$TEMP2
+ vpaddq $TEMP0,$ACC1,$ACC1
+ vpmuludq $Yi,$TEMP1,$TEMP1
+ vmovdqu -8+32*4-128($np),$TEMP0
+ vpaddq $TEMP1,$ACC2,$ACC2
+ vpmuludq $Yi,$TEMP2,$TEMP2
+ vmovdqu -8+32*5-128($np),$TEMP1
+ vpaddq $TEMP2,$ACC3,$ACC3
+ vpmuludq $Yi,$TEMP0,$TEMP0
+ vmovdqu -8+32*6-128($np),$TEMP2
+ vpaddq $TEMP0,$ACC4,$ACC4
+ vpmuludq $Yi,$TEMP1,$TEMP1
+ vmovdqu -8+32*7-128($np),$TEMP0
+ vpaddq $TEMP1,$ACC5,$ACC5
+ vpmuludq $Yi,$TEMP2,$TEMP2
+ vmovdqu -8+32*8-128($np),$TEMP1
+ vpaddq $TEMP2,$ACC6,$ACC6
+ vpmuludq $Yi,$TEMP0,$TEMP0
+ vmovdqu -8+32*9-128($np),$TEMP2
+ vpaddq $TEMP0,$ACC7,$ACC7
+ vpmuludq $Yi,$TEMP1,$TEMP1
+ vpaddq $TEMP1,$ACC8,$ACC8
+ vpmuludq $Yi,$TEMP2,$TEMP2
+ vpaddq $TEMP2,$ACC9,$ACC9
+
+ vmovdqu -16+32*1-128($ap),$TEMP0
+ mov %rbx,%rax
+ imulq -128($ap),%rax
+ add $r2,%rax
+
+ vmovdqu -16+32*2-128($ap),$TEMP1
+ mov %rax,$r2
+ imull $n0, %eax
+ and \$0x1fffffff, %eax
+
+ imulq 8-128($ap),%rbx
+ add %rbx,$r3
+ vpmuludq $Bi,$TEMP0,$TEMP0
+ vmovd %eax, $Yi
+ vmovdqu -16+32*3-128($ap),$TEMP2
+ vpaddq $TEMP0,$ACC1,$ACC1
+ vpmuludq $Bi,$TEMP1,$TEMP1
+ vpbroadcastq $Yi, $Yi
+ vmovdqu -16+32*4-128($ap),$TEMP0
+ vpaddq $TEMP1,$ACC2,$ACC2
+ vpmuludq $Bi,$TEMP2,$TEMP2
+ vmovdqu -16+32*5-128($ap),$TEMP1
+ vpaddq $TEMP2,$ACC3,$ACC3
+ vpmuludq $Bi,$TEMP0,$TEMP0
+ vmovdqu -16+32*6-128($ap),$TEMP2
+ vpaddq $TEMP0,$ACC4,$ACC4
+ vpmuludq $Bi,$TEMP1,$TEMP1
+ vmovdqu -16+32*7-128($ap),$TEMP0
+ vpaddq $TEMP1,$ACC5,$ACC5
+ vpmuludq $Bi,$TEMP2,$TEMP2
+ vmovdqu -16+32*8-128($ap),$TEMP1
+ vpaddq $TEMP2,$ACC6,$ACC6
+ vpmuludq $Bi,$TEMP0,$TEMP0
+ vmovdqu -16+32*9-128($ap),$TEMP2
+ vpaddq $TEMP0,$ACC7,$ACC7
+ vpmuludq $Bi,$TEMP1,$TEMP1
+ vpaddq $TEMP1,$ACC8,$ACC8
+ vpmuludq $Bi,$TEMP2,$TEMP2
+ vpbroadcastq 24($bp), $Bi
+ vpaddq $TEMP2,$ACC9,$ACC9
+
+ vmovdqu -16+32*1-128($np),$TEMP0
+ mov %rax,%rdx
+ imulq -128($np),%rax
+ add %rax,$r2
+ vmovdqu -16+32*2-128($np),$TEMP1
+ imulq 8-128($np),%rdx
+ add %rdx,$r3
+ shr \$29, $r2
+
+ vpmuludq $Yi,$TEMP0,$TEMP0
+ vmovq $Bi, %rbx
+ vmovdqu -16+32*3-128($np),$TEMP2
+ vpaddq $TEMP0,$ACC1,$ACC1
+ vpmuludq $Yi,$TEMP1,$TEMP1
+ vmovdqu -16+32*4-128($np),$TEMP0
+ vpaddq $TEMP1,$ACC2,$ACC2
+ vpmuludq $Yi,$TEMP2,$TEMP2
+ vmovdqu -16+32*5-128($np),$TEMP1
+ vpaddq $TEMP2,$ACC3,$ACC3
+ vpmuludq $Yi,$TEMP0,$TEMP0
+ vmovdqu -16+32*6-128($np),$TEMP2
+ vpaddq $TEMP0,$ACC4,$ACC4
+ vpmuludq $Yi,$TEMP1,$TEMP1
+ vmovdqu -16+32*7-128($np),$TEMP0
+ vpaddq $TEMP1,$ACC5,$ACC5
+ vpmuludq $Yi,$TEMP2,$TEMP2
+ vmovdqu -16+32*8-128($np),$TEMP1
+ vpaddq $TEMP2,$ACC6,$ACC6
+ vpmuludq $Yi,$TEMP0,$TEMP0
+ vmovdqu -16+32*9-128($np),$TEMP2
+ vpaddq $TEMP0,$ACC7,$ACC7
+ vpmuludq $Yi,$TEMP1,$TEMP1
+ vmovdqu -24+32*1-128($ap),$TEMP0
+ vpaddq $TEMP1,$ACC8,$ACC8
+ vpmuludq $Yi,$TEMP2,$TEMP2
+ vmovdqu -24+32*2-128($ap),$TEMP1
+ vpaddq $TEMP2,$ACC9,$ACC9
+
+ add $r2, $r3
+ imulq -128($ap),%rbx
+ add %rbx,$r3
+
+ mov $r3, %rax
+ imull $n0, %eax
+ and \$0x1fffffff, %eax
+
+ vpmuludq $Bi,$TEMP0,$TEMP0
+ vmovd %eax, $Yi
+ vmovdqu -24+32*3-128($ap),$TEMP2
+ vpaddq $TEMP0,$ACC1,$ACC1
+ vpmuludq $Bi,$TEMP1,$TEMP1
+ vpbroadcastq $Yi, $Yi
+ vmovdqu -24+32*4-128($ap),$TEMP0
+ vpaddq $TEMP1,$ACC2,$ACC2
+ vpmuludq $Bi,$TEMP2,$TEMP2
+ vmovdqu -24+32*5-128($ap),$TEMP1
+ vpaddq $TEMP2,$ACC3,$ACC3
+ vpmuludq $Bi,$TEMP0,$TEMP0
+ vmovdqu -24+32*6-128($ap),$TEMP2
+ vpaddq $TEMP0,$ACC4,$ACC4
+ vpmuludq $Bi,$TEMP1,$TEMP1
+ vmovdqu -24+32*7-128($ap),$TEMP0
+ vpaddq $TEMP1,$ACC5,$ACC5
+ vpmuludq $Bi,$TEMP2,$TEMP2
+ vmovdqu -24+32*8-128($ap),$TEMP1
+ vpaddq $TEMP2,$ACC6,$ACC6
+ vpmuludq $Bi,$TEMP0,$TEMP0
+ vmovdqu -24+32*9-128($ap),$TEMP2
+ vpaddq $TEMP0,$ACC7,$ACC7
+ vpmuludq $Bi,$TEMP1,$TEMP1
+ vpaddq $TEMP1,$ACC8,$ACC8
+ vpmuludq $Bi,$TEMP2,$TEMP2
+ vpbroadcastq 32($bp), $Bi
+ vpaddq $TEMP2,$ACC9,$ACC9
+ add \$32, $bp # $bp++
+
+ vmovdqu -24+32*1-128($np),$TEMP0
+ imulq -128($np),%rax
+ add %rax,$r3
+ shr \$29, $r3
+
+ vmovdqu -24+32*2-128($np),$TEMP1
+ vpmuludq $Yi,$TEMP0,$TEMP0
+ vmovq $Bi, %rbx
+ vmovdqu -24+32*3-128($np),$TEMP2
+ vpaddq $TEMP0,$ACC1,$ACC0 # $ACC0==$TEMP0
+ vpmuludq $Yi,$TEMP1,$TEMP1
+ vmovdqu $ACC0, (%rsp) # transfer $r0-$r3
+ vpaddq $TEMP1,$ACC2,$ACC1
+ vmovdqu -24+32*4-128($np),$TEMP0
+ vpmuludq $Yi,$TEMP2,$TEMP2
+ vmovdqu -24+32*5-128($np),$TEMP1
+ vpaddq $TEMP2,$ACC3,$ACC2
+ vpmuludq $Yi,$TEMP0,$TEMP0
+ vmovdqu -24+32*6-128($np),$TEMP2
+ vpaddq $TEMP0,$ACC4,$ACC3
+ vpmuludq $Yi,$TEMP1,$TEMP1
+ vmovdqu -24+32*7-128($np),$TEMP0
+ vpaddq $TEMP1,$ACC5,$ACC4
+ vpmuludq $Yi,$TEMP2,$TEMP2
+ vmovdqu -24+32*8-128($np),$TEMP1
+ vpaddq $TEMP2,$ACC6,$ACC5
+ vpmuludq $Yi,$TEMP0,$TEMP0
+ vmovdqu -24+32*9-128($np),$TEMP2
+ mov $r3, $r0
+ vpaddq $TEMP0,$ACC7,$ACC6
+ vpmuludq $Yi,$TEMP1,$TEMP1
+ add (%rsp), $r0
+ vpaddq $TEMP1,$ACC8,$ACC7
+ vpmuludq $Yi,$TEMP2,$TEMP2
+ vmovq $r3, $TEMP1
+ vpaddq $TEMP2,$ACC9,$ACC8
+
+ dec $i
+ jnz .Loop_mul_1024
+___
+
+# (*) The original implementation corrected ACC1-ACC3 for overflow
+#	after 7 loop runs, i.e. after 28 iterations or 56 additions.
+#	But since resources are underutilized, it's possible to correct
+#	in each iteration at marginal performance loss; and since the
+#	correction then happens every iteration, fewer digits need to
+#	be corrected, which avoids the penalty completely. Also note
+#	that only three digits out of four are corrected. This works
+#	because the most significant digit is subject to fewer additions.
+
+$TEMP0 = $ACC9;
+$TEMP3 = $Bi;
+$TEMP4 = $Yi;
+$code.=<<___;
+ vpermq \$0, $AND_MASK, $AND_MASK
+ vpaddq (%rsp), $TEMP1, $ACC0
+
+ vpsrlq \$29, $ACC0, $TEMP1
+ vpand $AND_MASK, $ACC0, $ACC0
+ vpsrlq \$29, $ACC1, $TEMP2
+ vpand $AND_MASK, $ACC1, $ACC1
+ vpsrlq \$29, $ACC2, $TEMP3
+ vpermq \$0x93, $TEMP1, $TEMP1
+ vpand $AND_MASK, $ACC2, $ACC2
+ vpsrlq \$29, $ACC3, $TEMP4
+ vpermq \$0x93, $TEMP2, $TEMP2
+ vpand $AND_MASK, $ACC3, $ACC3
+
+ vpblendd \$3, $ZERO, $TEMP1, $TEMP0
+ vpermq \$0x93, $TEMP3, $TEMP3
+ vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
+ vpermq \$0x93, $TEMP4, $TEMP4
+ vpaddq $TEMP0, $ACC0, $ACC0
+ vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
+ vpaddq $TEMP1, $ACC1, $ACC1
+ vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
+ vpaddq $TEMP2, $ACC2, $ACC2
+ vpblendd \$3, $TEMP4, $ZERO, $TEMP4
+ vpaddq $TEMP3, $ACC3, $ACC3
+ vpaddq $TEMP4, $ACC4, $ACC4
+
+ vpsrlq \$29, $ACC0, $TEMP1
+ vpand $AND_MASK, $ACC0, $ACC0
+ vpsrlq \$29, $ACC1, $TEMP2
+ vpand $AND_MASK, $ACC1, $ACC1
+ vpsrlq \$29, $ACC2, $TEMP3
+ vpermq \$0x93, $TEMP1, $TEMP1
+ vpand $AND_MASK, $ACC2, $ACC2
+ vpsrlq \$29, $ACC3, $TEMP4
+ vpermq \$0x93, $TEMP2, $TEMP2
+ vpand $AND_MASK, $ACC3, $ACC3
+ vpermq \$0x93, $TEMP3, $TEMP3
+
+ vpblendd \$3, $ZERO, $TEMP1, $TEMP0
+ vpermq \$0x93, $TEMP4, $TEMP4
+ vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
+ vpaddq $TEMP0, $ACC0, $ACC0
+ vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
+ vpaddq $TEMP1, $ACC1, $ACC1
+ vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
+ vpaddq $TEMP2, $ACC2, $ACC2
+ vpblendd \$3, $TEMP4, $ZERO, $TEMP4
+ vpaddq $TEMP3, $ACC3, $ACC3
+ vpaddq $TEMP4, $ACC4, $ACC4
+
+ vmovdqu $ACC0, 0-128($rp)
+ vmovdqu $ACC1, 32-128($rp)
+ vmovdqu $ACC2, 64-128($rp)
+ vmovdqu $ACC3, 96-128($rp)
+___
+
+$TEMP5=$ACC0;
+$code.=<<___;
+ vpsrlq \$29, $ACC4, $TEMP1
+ vpand $AND_MASK, $ACC4, $ACC4
+ vpsrlq \$29, $ACC5, $TEMP2
+ vpand $AND_MASK, $ACC5, $ACC5
+ vpsrlq \$29, $ACC6, $TEMP3
+ vpermq \$0x93, $TEMP1, $TEMP1
+ vpand $AND_MASK, $ACC6, $ACC6
+ vpsrlq \$29, $ACC7, $TEMP4
+ vpermq \$0x93, $TEMP2, $TEMP2
+ vpand $AND_MASK, $ACC7, $ACC7
+ vpsrlq \$29, $ACC8, $TEMP5
+ vpermq \$0x93, $TEMP3, $TEMP3
+ vpand $AND_MASK, $ACC8, $ACC8
+ vpermq \$0x93, $TEMP4, $TEMP4
+
+ vpblendd \$3, $ZERO, $TEMP1, $TEMP0
+ vpermq \$0x93, $TEMP5, $TEMP5
+ vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
+ vpaddq $TEMP0, $ACC4, $ACC4
+ vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
+ vpaddq $TEMP1, $ACC5, $ACC5
+ vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
+ vpaddq $TEMP2, $ACC6, $ACC6
+ vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
+ vpaddq $TEMP3, $ACC7, $ACC7
+ vpaddq $TEMP4, $ACC8, $ACC8
+
+ vpsrlq \$29, $ACC4, $TEMP1
+ vpand $AND_MASK, $ACC4, $ACC4
+ vpsrlq \$29, $ACC5, $TEMP2
+ vpand $AND_MASK, $ACC5, $ACC5
+ vpsrlq \$29, $ACC6, $TEMP3
+ vpermq \$0x93, $TEMP1, $TEMP1
+ vpand $AND_MASK, $ACC6, $ACC6
+ vpsrlq \$29, $ACC7, $TEMP4
+ vpermq \$0x93, $TEMP2, $TEMP2
+ vpand $AND_MASK, $ACC7, $ACC7
+ vpsrlq \$29, $ACC8, $TEMP5
+ vpermq \$0x93, $TEMP3, $TEMP3
+ vpand $AND_MASK, $ACC8, $ACC8
+ vpermq \$0x93, $TEMP4, $TEMP4
+
+ vpblendd \$3, $ZERO, $TEMP1, $TEMP0
+ vpermq \$0x93, $TEMP5, $TEMP5
+ vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
+ vpaddq $TEMP0, $ACC4, $ACC4
+ vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
+ vpaddq $TEMP1, $ACC5, $ACC5
+ vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
+ vpaddq $TEMP2, $ACC6, $ACC6
+ vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
+ vpaddq $TEMP3, $ACC7, $ACC7
+ vpaddq $TEMP4, $ACC8, $ACC8
+
+ vmovdqu $ACC4, 128-128($rp)
+ vmovdqu $ACC5, 160-128($rp)
+ vmovdqu $ACC6, 192-128($rp)
+ vmovdqu $ACC7, 224-128($rp)
+ vmovdqu $ACC8, 256-128($rp)
+ vzeroupper
+
+ mov %rbp, %rax
+___
+$code.=<<___ if ($win64);
+ movaps -0xd8(%rax),%xmm6
+ movaps -0xc8(%rax),%xmm7
+ movaps -0xb8(%rax),%xmm8
+ movaps -0xa8(%rax),%xmm9
+ movaps -0x98(%rax),%xmm10
+ movaps -0x88(%rax),%xmm11
+ movaps -0x78(%rax),%xmm12
+ movaps -0x68(%rax),%xmm13
+ movaps -0x58(%rax),%xmm14
+ movaps -0x48(%rax),%xmm15
+___
+$code.=<<___;
+ mov -48(%rax),%r15
+ mov -40(%rax),%r14
+ mov -32(%rax),%r13
+ mov -24(%rax),%r12
+ mov -16(%rax),%rbp
+ mov -8(%rax),%rbx
+ lea (%rax),%rsp # restore %rsp
+.Lmul_1024_epilogue:
+ ret
+.size rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2
+___
+}
+{
+my ($out,$inp) = $win64 ? ("%rcx","%rdx") : ("%rdi","%rsi");
+my @T = map("%r$_",(8..11));
+
+$code.=<<___;
+.globl rsaz_1024_red2norm_avx2
+.type rsaz_1024_red2norm_avx2,\@abi-omnipotent
+.align 32
+rsaz_1024_red2norm_avx2:
+ sub \$-128,$inp # size optimization
+ xor %rax,%rax
+___
+
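+# Each 64-bit output word collects the 29-bit digits that overlap it,
+# shifted into place, and the spill-over of its topmost digit is carried
+# into the next word. A minimal Perl sketch of the whole conversion,
+# assuming Math::BigInt values in @red (illustrative only, not used by
+# the generator):
+#
+#	my $n = 0;  $n += $red[$_] << (29*$_) for (0..35);
+#	my @norm = map { ($n >> (64*$_)) & 0xffffffffffffffff } (0..15);
+#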
+for ($j=0,$i=0; $i<16; $i++) {
+ my $k=0;
+ while (29*$j<64*($i+1)) { # load data till boundary
+ $code.=" mov `8*$j-128`($inp), @T[0]\n";
+ $j++; $k++; push(@T,shift(@T));
+ }
+ $l=$k;
+	while ($k>1) {			# shift all loaded values but the last
+ $code.=" shl \$`29*($j-$k)`,@T[-$k]\n";
+ $k--;
+ }
+ $code.=<<___; # shift last value
+ mov @T[-1], @T[0]
+ shl \$`29*($j-1)`, @T[-1]
+ shr \$`-29*($j-1)`, @T[0]
+___
+ while ($l) { # accumulate all values
+ $code.=" add @T[-$l], %rax\n";
+ $l--;
+ }
+ $code.=<<___;
+	adc	\$0, @T[0]		# consume any carry
+ mov %rax, 8*$i($out)
+ mov @T[0], %rax
+___
+ push(@T,shift(@T));
+}
+$code.=<<___;
+ ret
+.size rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2
+
+.globl rsaz_1024_norm2red_avx2
+.type rsaz_1024_norm2red_avx2,\@abi-omnipotent
+.align 32
+rsaz_1024_norm2red_avx2:
+ sub \$-128,$out # size optimization
+ mov ($inp),@T[0]
+ mov \$0x1fffffff,%eax
+___
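+# The inverse conversion is simpler: redundant digit j is bits
+# [29*j, 29*j+29) of the 1024-bit input, i.e. (n >> (29*j)) & 0x1fffffff;
+# shrd below extracts the digits that straddle two adjacent 64-bit words.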
+for ($j=0,$i=0; $i<16; $i++) {
+ $code.=" mov `8*($i+1)`($inp),@T[1]\n" if ($i<15);
+ $code.=" xor @T[1],@T[1]\n" if ($i==15);
+ my $k=1;
+ while (29*($j+1)<64*($i+1)) {
+ $code.=<<___;
+ mov @T[0],@T[-$k]
+ shr \$`29*$j`,@T[-$k]
+ and %rax,@T[-$k] # &0x1fffffff
+ mov @T[-$k],`8*$j-128`($out)
+___
+ $j++; $k++;
+ }
+ $code.=<<___;
+ shrd \$`29*$j`,@T[1],@T[0]
+ and %rax,@T[0]
+ mov @T[0],`8*$j-128`($out)
+___
+ $j++;
+ push(@T,shift(@T));
+}
+$code.=<<___;
+ mov @T[0],`8*$j-128`($out) # zero
+ mov @T[0],`8*($j+1)-128`($out)
+ mov @T[0],`8*($j+2)-128`($out)
+ mov @T[0],`8*($j+3)-128`($out)
+ ret
+.size rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2
+___
+}
+{
+my ($out,$inp,$power) = $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
+
+$code.=<<___;
+.globl rsaz_1024_scatter5_avx2
+.type rsaz_1024_scatter5_avx2,\@abi-omnipotent
+.align 32
+rsaz_1024_scatter5_avx2:
+ vzeroupper
+ vmovdqu .Lscatter_permd(%rip),%ymm5
+ shl \$4,$power
+ lea ($out,$power),$out
+ mov \$9,%eax
+ jmp .Loop_scatter_1024
+
+.align 32
+.Loop_scatter_1024:
+ vmovdqu ($inp),%ymm0
+ lea 32($inp),$inp
+ vpermd %ymm0,%ymm5,%ymm0
+ vmovdqu %xmm0,($out)
+ lea 16*32($out),$out
+ dec %eax
+ jnz .Loop_scatter_1024
+
+ vzeroupper
+ ret
+.size rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2
+
+.globl rsaz_1024_gather5_avx2
+.type rsaz_1024_gather5_avx2,\@abi-omnipotent
+.align 32
+rsaz_1024_gather5_avx2:
+___
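+# A note on side channels: the gather reads every candidate cache line of
+# the table and selects the wanted entry with the AND/OR masks prepared
+# from .Lgather_table, so the access pattern does not depend on $power at
+# cache-line granularity.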
+$code.=<<___ if ($win64);
+ lea -0x88(%rsp),%rax
+ vzeroupper
+.LSEH_begin_rsaz_1024_gather5:
+	# I can't trust assembler to use specific encoding :-(
+ .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
+ .byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6,-0x20(%rax)
+ .byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7,-0x10(%rax)
+ .byte 0xc5,0x78,0x29,0x40,0x00 #vmovaps %xmm8,0(%rax)
+ .byte 0xc5,0x78,0x29,0x48,0x10 #vmovaps %xmm9,0x10(%rax)
+ .byte 0xc5,0x78,0x29,0x50,0x20 #vmovaps %xmm10,0x20(%rax)
+ .byte 0xc5,0x78,0x29,0x58,0x30 #vmovaps %xmm11,0x30(%rax)
+ .byte 0xc5,0x78,0x29,0x60,0x40 #vmovaps %xmm12,0x40(%rax)
+ .byte 0xc5,0x78,0x29,0x68,0x50 #vmovaps %xmm13,0x50(%rax)
+ .byte 0xc5,0x78,0x29,0x70,0x60 #vmovaps %xmm14,0x60(%rax)
+ .byte 0xc5,0x78,0x29,0x78,0x70 #vmovaps %xmm15,0x70(%rax)
+___
+$code.=<<___;
+ lea .Lgather_table(%rip),%r11
+ mov $power,%eax
+ and \$3,$power
+ shr \$2,%eax # cache line number
+ shl \$4,$power # offset within cache line
+
+ vmovdqu -32(%r11),%ymm7 # .Lgather_permd
+ vpbroadcastb 8(%r11,%rax), %xmm8
+ vpbroadcastb 7(%r11,%rax), %xmm9
+ vpbroadcastb 6(%r11,%rax), %xmm10
+ vpbroadcastb 5(%r11,%rax), %xmm11
+ vpbroadcastb 4(%r11,%rax), %xmm12
+ vpbroadcastb 3(%r11,%rax), %xmm13
+ vpbroadcastb 2(%r11,%rax), %xmm14
+ vpbroadcastb 1(%r11,%rax), %xmm15
+
+ lea 64($inp,$power),$inp
+ mov \$64,%r11 # size optimization
+ mov \$9,%eax
+ jmp .Loop_gather_1024
+
+.align 32
+.Loop_gather_1024:
+ vpand -64($inp), %xmm8,%xmm0
+ vpand ($inp), %xmm9,%xmm1
+ vpand 64($inp), %xmm10,%xmm2
+ vpand ($inp,%r11,2), %xmm11,%xmm3
+ vpor %xmm0,%xmm1,%xmm1
+ vpand 64($inp,%r11,2), %xmm12,%xmm4
+ vpor %xmm2,%xmm3,%xmm3
+ vpand ($inp,%r11,4), %xmm13,%xmm5
+ vpor %xmm1,%xmm3,%xmm3
+ vpand 64($inp,%r11,4), %xmm14,%xmm6
+ vpor %xmm4,%xmm5,%xmm5
+ vpand -128($inp,%r11,8), %xmm15,%xmm2
+ lea ($inp,%r11,8),$inp
+ vpor %xmm3,%xmm5,%xmm5
+ vpor %xmm2,%xmm6,%xmm6
+ vpor %xmm5,%xmm6,%xmm6
+ vpermd %ymm6,%ymm7,%ymm6
+ vmovdqu %ymm6,($out)
+ lea 32($out),$out
+ dec %eax
+ jnz .Loop_gather_1024
+
+ vpxor %ymm0,%ymm0,%ymm0
+ vmovdqu %ymm0,($out)
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ movaps 0x10(%rsp),%xmm7
+ movaps 0x20(%rsp),%xmm8
+ movaps 0x30(%rsp),%xmm9
+ movaps 0x40(%rsp),%xmm10
+ movaps 0x50(%rsp),%xmm11
+ movaps 0x60(%rsp),%xmm12
+ movaps 0x70(%rsp),%xmm13
+ movaps 0x80(%rsp),%xmm14
+ movaps 0x90(%rsp),%xmm15
+ lea 0xa8(%rsp),%rsp
+.LSEH_end_rsaz_1024_gather5:
+___
+$code.=<<___;
+ ret
+.size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2
+___
+}
+
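+# rsaz_avx2_eligible() decides at run time whether the AVX2 code paths
+# should be used: it tests the AVX2 bit of OPENSSL_ia32cap_P and, when
+# the assembler supports AD*X, returns 0 on BMI2+AD*X parts where the
+# scalar AD*X path is preferred (see the Broadwell note above).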
+$code.=<<___;
+.extern OPENSSL_ia32cap_P
+.globl rsaz_avx2_eligible
+.type rsaz_avx2_eligible,\@abi-omnipotent
+.align 32
+rsaz_avx2_eligible:
+ mov OPENSSL_ia32cap_P+8(%rip),%eax
+___
+$code.=<<___ if ($addx);
+ mov \$`1<<8|1<<19`,%ecx
+ mov \$0,%edx
+ and %eax,%ecx
+ cmp \$`1<<8|1<<19`,%ecx # check for BMI2+AD*X
+ cmove %edx,%eax
+___
+$code.=<<___;
+ and \$`1<<5`,%eax
+ shr \$5,%eax
+ ret
+.size rsaz_avx2_eligible,.-rsaz_avx2_eligible
+
+.align 64
+.Land_mask:
+ .quad 0x1fffffff,0x1fffffff,0x1fffffff,-1
+.Lscatter_permd:
+ .long 0,2,4,6,7,7,7,7
+.Lgather_permd:
+ .long 0,7,1,7,2,7,3,7
+.Lgather_table:
+ .byte 0,0,0,0,0,0,0,0, 0xff,0,0,0,0,0,0,0
+.align 64
+___
+
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___
+.extern __imp_RtlVirtualUnwind
+.type rsaz_se_handler,\@abi-omnipotent
+.align 16
+rsaz_se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ mov 160($context),%rax # pull context->Rbp
+
+ mov -48(%rax),%r15
+ mov -40(%rax),%r14
+ mov -32(%rax),%r13
+ mov -24(%rax),%r12
+ mov -16(%rax),%rbp
+ mov -8(%rax),%rbx
+ mov %r15,240($context)
+ mov %r14,232($context)
+ mov %r13,224($context)
+ mov %r12,216($context)
+ mov %rbp,160($context)
+ mov %rbx,144($context)
+
+ lea -0xd8(%rax),%rsi # %xmm save area
+ lea 512($context),%rdi # & context.Xmm6
+ mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+
+.Lcommon_seh_tail:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size rsaz_se_handler,.-rsaz_se_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_rsaz_1024_sqr_avx2
+ .rva .LSEH_end_rsaz_1024_sqr_avx2
+ .rva .LSEH_info_rsaz_1024_sqr_avx2
+
+ .rva .LSEH_begin_rsaz_1024_mul_avx2
+ .rva .LSEH_end_rsaz_1024_mul_avx2
+ .rva .LSEH_info_rsaz_1024_mul_avx2
+
+ .rva .LSEH_begin_rsaz_1024_gather5
+ .rva .LSEH_end_rsaz_1024_gather5
+ .rva .LSEH_info_rsaz_1024_gather5
+.section .xdata
+.align 8
+.LSEH_info_rsaz_1024_sqr_avx2:
+ .byte 9,0,0,0
+ .rva rsaz_se_handler
+ .rva .Lsqr_1024_body,.Lsqr_1024_epilogue
+.LSEH_info_rsaz_1024_mul_avx2:
+ .byte 9,0,0,0
+ .rva rsaz_se_handler
+ .rva .Lmul_1024_body,.Lmul_1024_epilogue
+.LSEH_info_rsaz_1024_gather5:
+ .byte 0x01,0x33,0x16,0x00
+ .byte 0x36,0xf8,0x09,0x00 #vmovaps 0x90(rsp),xmm15
+ .byte 0x31,0xe8,0x08,0x00 #vmovaps 0x80(rsp),xmm14
+ .byte 0x2c,0xd8,0x07,0x00 #vmovaps 0x70(rsp),xmm13
+ .byte 0x27,0xc8,0x06,0x00 #vmovaps 0x60(rsp),xmm12
+ .byte 0x22,0xb8,0x05,0x00 #vmovaps 0x50(rsp),xmm11
+ .byte 0x1d,0xa8,0x04,0x00 #vmovaps 0x40(rsp),xmm10
+ .byte 0x18,0x98,0x03,0x00 #vmovaps 0x30(rsp),xmm9
+ .byte 0x13,0x88,0x02,0x00 #vmovaps 0x20(rsp),xmm8
+ .byte 0x0e,0x78,0x01,0x00 #vmovaps 0x10(rsp),xmm7
+ .byte 0x09,0x68,0x00,0x00 #vmovaps 0x00(rsp),xmm6
+ .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8
+___
+}
+
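+# post-process the generated code: evaluate `...` expressions, reduce
+# shift amounts modulo 64, and rewrite instructions that must use the
+# %xmm form of a register even when the operand was named as %ymm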
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval($1)/ge;
+
+ s/\b(sh[rl]d?\s+\$)(-?[0-9]+)/$1.$2%64/ge or
+
+ s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
+ s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or
+ s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
+ s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
+ s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;
+ print $_,"\n";
+}
+
+}}} else {{{
+print <<___; # assembler is too old
+.text
+
+.globl rsaz_avx2_eligible
+.type rsaz_avx2_eligible,\@abi-omnipotent
+rsaz_avx2_eligible:
+ xor %eax,%eax
+ ret
+.size rsaz_avx2_eligible,.-rsaz_avx2_eligible
+
+.globl rsaz_1024_sqr_avx2
+.globl rsaz_1024_mul_avx2
+.globl rsaz_1024_norm2red_avx2
+.globl rsaz_1024_red2norm_avx2
+.globl rsaz_1024_scatter5_avx2
+.globl rsaz_1024_gather5_avx2
+.type rsaz_1024_sqr_avx2,\@abi-omnipotent
+rsaz_1024_sqr_avx2:
+rsaz_1024_mul_avx2:
+rsaz_1024_norm2red_avx2:
+rsaz_1024_red2norm_avx2:
+rsaz_1024_scatter5_avx2:
+rsaz_1024_gather5_avx2:
+ .byte 0x0f,0x0b # ud2
+ ret
+.size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
+___
+}}}
+
+close STDOUT;
diff --git a/src/crypto/bn/asm/rsaz-x86_64.pl b/src/crypto/bn/asm/rsaz-x86_64.pl
new file mode 100644
index 0000000..3bd45db
--- /dev/null
+++ b/src/crypto/bn/asm/rsaz-x86_64.pl
@@ -0,0 +1,2144 @@
+#!/usr/bin/env perl
+
+##############################################################################
+# #
+# Copyright (c) 2012, Intel Corporation #
+# #
+# All rights reserved. #
+# #
+# Redistribution and use in source and binary forms, with or without #
+# modification, are permitted provided that the following conditions are #
+# met: #
+# #
+# * Redistributions of source code must retain the above copyright #
+# notice, this list of conditions and the following disclaimer. #
+# #
+# * Redistributions in binary form must reproduce the above copyright #
+# notice, this list of conditions and the following disclaimer in the #
+# documentation and/or other materials provided with the #
+# distribution. #
+# #
+# * Neither the name of the Intel Corporation nor the names of its #
+# contributors may be used to endorse or promote products derived from #
+# this software without specific prior written permission. #
+# #
+# #
+# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY #
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE #
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR #
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR #
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, #
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, #
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR #
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF #
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING #
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS #
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #
+# #
+##############################################################################
+# Developers and authors: #
+# Shay Gueron (1, 2), and Vlad Krasnov (1) #
+# (1) Intel Architecture Group, Microprocessor and Chipset Development, #
+# Israel Development Center, Haifa, Israel #
+# (2) University of Haifa #
+##############################################################################
+# Reference: #
+# [1] S. Gueron, "Efficient Software Implementations of Modular #
+# Exponentiation", http://eprint.iacr.org/2011/239 #
+# [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring". #
+# IEEE Proceedings of 9th International Conference on Information #
+# Technology: New Generations (ITNG 2012), 821-823 (2012). #
+# [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation#
+# Journal of Cryptographic Engineering 2:31-43 (2012). #
+# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis #
+# resistant 512-bit and 1024-bit modular exponentiation for optimizing #
+# RSA1024 and RSA2048 on x86_64 platforms", #
+# http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest#
+##############################################################################
+
+# While the original submission covers 512- and 1024-bit exponentiation,
+# this module is limited to the 512-bit version only (and as such
+# accelerates RSA1024 sign). This is because the improvement for longer
+# keys is not high enough to justify the effort; the highest measured
+# gain was ~5%, on Westmere. [This is relative to OpenSSL 1.0.2,
+# unreleased at the time of this writing.] Nor does this module
+# implement a "monolithic" complete-exponentiation jumbo subroutine;
+# it adheres to a more modular mixture of C and assembly. And it's
+# optimized even for processors other than the Intel Core family (see
+# the table below for improvement coefficients).
+# <appro@openssl.org>
+#
+# RSA1024 sign/sec this/original |this/rsax(*) this/fips(*)
+# ----------------+---------------------------
+# Opteron +13% |+5% +20%
+# Bulldozer -0% |-1% +10%
+# P4 +11% |+7% +8%
+# Westmere +5% |+14% +17%
+# Sandy Bridge +2% |+12% +29%
+# Ivy Bridge +1% |+11% +35%
+# Haswell(**) -0% |+12% +39%
+# Atom +13% |+11% +4%
+# VIA Nano +70% |+9% +25%
+#
+# (*) rsax engine and fips numbers are presented for reference
+# purposes;
+# (**) MULX was attempted, but found to give only marginal improvement;
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+ $addx = ($1>=2.23);
+}
+
+if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+ $addx = ($1>=2.10);
+}
+
+if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+ $addx = ($1>=12);
+}
+
+if (!$addx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) {
+ my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
+ $addx = ($ver>=3.03);
+}
+
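+# Unlike the AVX2 module, only $addx is probed here: it gates the
+# MULX/AD*X code path that is selected at run time via OPENSSL_ia32cap_P.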
+($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp"); # common internal API
+{
+my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d");
+
+$code.=<<___;
+.text
+
+.extern OPENSSL_ia32cap_P
+
+.globl rsaz_512_sqr
+.type rsaz_512_sqr,\@function,5
+.align 32
+rsaz_512_sqr: # 25-29% faster than rsaz_512_mul
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ subq \$128+24, %rsp
+.Lsqr_body:
+ movq $mod, %rbp # common argument
+ movq ($inp), %rdx
+ movq 8($inp), %rax
+ movq $n0, 128(%rsp)
+___
+$code.=<<___ if ($addx);
+ movl \$0x80100,%r11d
+ andl OPENSSL_ia32cap_P+8(%rip),%r11d
+	cmpl	\$0x80100,%r11d		# check for MULX and ADCX/ADOX
+ je .Loop_sqrx
+___
+$code.=<<___;
+ jmp .Loop_sqr
+
+.align 32
+.Loop_sqr:
+ movl $times,128+8(%rsp)
+#first iteration
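+	# each iteration multiplies one input word by every word above it,
+	# doubles the off-diagonal sum and adds in the diagonal square term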
+ movq %rdx, %rbx
+ mulq %rdx
+ movq %rax, %r8
+ movq 16($inp), %rax
+ movq %rdx, %r9
+
+ mulq %rbx
+ addq %rax, %r9
+ movq 24($inp), %rax
+ movq %rdx, %r10
+ adcq \$0, %r10
+
+ mulq %rbx
+ addq %rax, %r10
+ movq 32($inp), %rax
+ movq %rdx, %r11
+ adcq \$0, %r11
+
+ mulq %rbx
+ addq %rax, %r11
+ movq 40($inp), %rax
+ movq %rdx, %r12
+ adcq \$0, %r12
+
+ mulq %rbx
+ addq %rax, %r12
+ movq 48($inp), %rax
+ movq %rdx, %r13
+ adcq \$0, %r13
+
+ mulq %rbx
+ addq %rax, %r13
+ movq 56($inp), %rax
+ movq %rdx, %r14
+ adcq \$0, %r14
+
+ mulq %rbx
+ addq %rax, %r14
+ movq %rbx, %rax
+ movq %rdx, %r15
+ adcq \$0, %r15
+
+ addq %r8, %r8 #shlq \$1, %r8
+ movq %r9, %rcx
+ adcq %r9, %r9 #shld \$1, %r8, %r9
+
+ mulq %rax
+ movq %rax, (%rsp)
+ addq %rdx, %r8
+ adcq \$0, %r9
+
+ movq %r8, 8(%rsp)
+ shrq \$63, %rcx
+
+#second iteration
+ movq 8($inp), %r8
+ movq 16($inp), %rax
+ mulq %r8
+ addq %rax, %r10
+ movq 24($inp), %rax
+ movq %rdx, %rbx
+ adcq \$0, %rbx
+
+ mulq %r8
+ addq %rax, %r11
+ movq 32($inp), %rax
+ adcq \$0, %rdx
+ addq %rbx, %r11
+ movq %rdx, %rbx
+ adcq \$0, %rbx
+
+ mulq %r8
+ addq %rax, %r12
+ movq 40($inp), %rax
+ adcq \$0, %rdx
+ addq %rbx, %r12
+ movq %rdx, %rbx
+ adcq \$0, %rbx
+
+ mulq %r8
+ addq %rax, %r13
+ movq 48($inp), %rax
+ adcq \$0, %rdx
+ addq %rbx, %r13
+ movq %rdx, %rbx
+ adcq \$0, %rbx
+
+ mulq %r8
+ addq %rax, %r14
+ movq 56($inp), %rax
+ adcq \$0, %rdx
+ addq %rbx, %r14
+ movq %rdx, %rbx
+ adcq \$0, %rbx
+
+ mulq %r8
+ addq %rax, %r15
+ movq %r8, %rax
+ adcq \$0, %rdx
+ addq %rbx, %r15
+ movq %rdx, %r8
+ movq %r10, %rdx
+ adcq \$0, %r8
+
+ add %rdx, %rdx
+ lea (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
+ movq %r11, %rbx
+ adcq %r11, %r11 #shld \$1, %r10, %r11
+
+ mulq %rax
+ addq %rax, %r9
+ adcq %rdx, %r10
+ adcq \$0, %r11
+
+ movq %r9, 16(%rsp)
+ movq %r10, 24(%rsp)
+ shrq \$63, %rbx
+
+#third iteration
+ movq 16($inp), %r9
+ movq 24($inp), %rax
+ mulq %r9
+ addq %rax, %r12
+ movq 32($inp), %rax
+ movq %rdx, %rcx
+ adcq \$0, %rcx
+
+ mulq %r9
+ addq %rax, %r13
+ movq 40($inp), %rax
+ adcq \$0, %rdx
+ addq %rcx, %r13
+ movq %rdx, %rcx
+ adcq \$0, %rcx
+
+ mulq %r9
+ addq %rax, %r14
+ movq 48($inp), %rax
+ adcq \$0, %rdx
+ addq %rcx, %r14
+ movq %rdx, %rcx
+ adcq \$0, %rcx
+
+ mulq %r9
+ movq %r12, %r10
+ lea (%rbx,%r12,2), %r12 #shld \$1, %rbx, %r12
+ addq %rax, %r15
+ movq 56($inp), %rax
+ adcq \$0, %rdx
+ addq %rcx, %r15
+ movq %rdx, %rcx
+ adcq \$0, %rcx
+
+ mulq %r9
+ shrq \$63, %r10
+ addq %rax, %r8
+ movq %r9, %rax
+ adcq \$0, %rdx
+ addq %rcx, %r8
+ movq %rdx, %r9
+ adcq \$0, %r9
+
+ movq %r13, %rcx
+ leaq (%r10,%r13,2), %r13 #shld \$1, %r12, %r13
+
+ mulq %rax
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq \$0, %r13
+
+ movq %r11, 32(%rsp)
+ movq %r12, 40(%rsp)
+ shrq \$63, %rcx
+
+#fourth iteration
+ movq 24($inp), %r10
+ movq 32($inp), %rax
+ mulq %r10
+ addq %rax, %r14
+ movq 40($inp), %rax
+ movq %rdx, %rbx
+ adcq \$0, %rbx
+
+ mulq %r10
+ addq %rax, %r15
+ movq 48($inp), %rax
+ adcq \$0, %rdx
+ addq %rbx, %r15
+ movq %rdx, %rbx
+ adcq \$0, %rbx
+
+ mulq %r10
+ movq %r14, %r12
+ leaq (%rcx,%r14,2), %r14 #shld \$1, %rcx, %r14
+ addq %rax, %r8
+ movq 56($inp), %rax
+ adcq \$0, %rdx
+ addq %rbx, %r8
+ movq %rdx, %rbx
+ adcq \$0, %rbx
+
+ mulq %r10
+ shrq \$63, %r12
+ addq %rax, %r9
+ movq %r10, %rax
+ adcq \$0, %rdx
+ addq %rbx, %r9
+ movq %rdx, %r10
+ adcq \$0, %r10
+
+ movq %r15, %rbx
+ leaq (%r12,%r15,2),%r15 #shld \$1, %r14, %r15
+
+ mulq %rax
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq \$0, %r15
+
+ movq %r13, 48(%rsp)
+ movq %r14, 56(%rsp)
+ shrq \$63, %rbx
+
+#fifth iteration
+ movq 32($inp), %r11
+ movq 40($inp), %rax
+ mulq %r11
+ addq %rax, %r8
+ movq 48($inp), %rax
+ movq %rdx, %rcx
+ adcq \$0, %rcx
+
+ mulq %r11
+ addq %rax, %r9
+ movq 56($inp), %rax
+ adcq \$0, %rdx
+ movq %r8, %r12
+ leaq (%rbx,%r8,2), %r8 #shld \$1, %rbx, %r8
+ addq %rcx, %r9
+ movq %rdx, %rcx
+ adcq \$0, %rcx
+
+ mulq %r11
+ shrq \$63, %r12
+ addq %rax, %r10
+ movq %r11, %rax
+ adcq \$0, %rdx
+ addq %rcx, %r10
+ movq %rdx, %r11
+ adcq \$0, %r11
+
+ movq %r9, %rcx
+ leaq (%r12,%r9,2), %r9 #shld \$1, %r8, %r9
+
+ mulq %rax
+ addq %rax, %r15
+ adcq %rdx, %r8
+ adcq \$0, %r9
+
+ movq %r15, 64(%rsp)
+ movq %r8, 72(%rsp)
+ shrq \$63, %rcx
+
+#sixth iteration
+ movq 40($inp), %r12
+ movq 48($inp), %rax
+ mulq %r12
+ addq %rax, %r10
+ movq 56($inp), %rax
+ movq %rdx, %rbx
+ adcq \$0, %rbx
+
+ mulq %r12
+ addq %rax, %r11
+ movq %r12, %rax
+ movq %r10, %r15
+ leaq (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
+ adcq \$0, %rdx
+ shrq \$63, %r15
+ addq %rbx, %r11
+ movq %rdx, %r12
+ adcq \$0, %r12
+
+ movq %r11, %rbx
+ leaq (%r15,%r11,2), %r11 #shld \$1, %r10, %r11
+
+ mulq %rax
+ addq %rax, %r9
+ adcq %rdx, %r10
+ adcq \$0, %r11
+
+ movq %r9, 80(%rsp)
+ movq %r10, 88(%rsp)
+
+#seventh iteration
+ movq 48($inp), %r13
+ movq 56($inp), %rax
+ mulq %r13
+ addq %rax, %r12
+ movq %r13, %rax
+ movq %rdx, %r13
+ adcq \$0, %r13
+
+ xorq %r14, %r14
+ shlq \$1, %rbx
+ adcq %r12, %r12 #shld \$1, %rbx, %r12
+ adcq %r13, %r13 #shld \$1, %r12, %r13
+ adcq %r14, %r14 #shld \$1, %r13, %r14
+
+ mulq %rax
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq \$0, %r13
+
+ movq %r11, 96(%rsp)
+ movq %r12, 104(%rsp)
+
+#eighth iteration
+ movq 56($inp), %rax
+ mulq %rax
+ addq %rax, %r13
+ adcq \$0, %rdx
+
+ addq %rdx, %r14
+
+ movq %r13, 112(%rsp)
+ movq %r14, 120(%rsp)
+
+ movq (%rsp), %r8
+ movq 8(%rsp), %r9
+ movq 16(%rsp), %r10
+ movq 24(%rsp), %r11
+ movq 32(%rsp), %r12
+ movq 40(%rsp), %r13
+ movq 48(%rsp), %r14
+ movq 56(%rsp), %r15
+
+ call __rsaz_512_reduce
+
+ addq 64(%rsp), %r8
+ adcq 72(%rsp), %r9
+ adcq 80(%rsp), %r10
+ adcq 88(%rsp), %r11
+ adcq 96(%rsp), %r12
+ adcq 104(%rsp), %r13
+ adcq 112(%rsp), %r14
+ adcq 120(%rsp), %r15
+ sbbq %rcx, %rcx
+
+ call __rsaz_512_subtract
+
+ movq %r8, %rdx
+ movq %r9, %rax
+ movl 128+8(%rsp), $times
+ movq $out, $inp
+
+ decl $times
+ jnz .Loop_sqr
+___
+if ($addx) {
+$code.=<<___;
+ jmp .Lsqr_tail
+
+.align 32
+.Loop_sqrx:
+ movl $times,128+8(%rsp)
+ movq $out, %xmm0 # off-load
+ movq %rbp, %xmm1 # off-load
+#first iteration
+ mulx %rax, %r8, %r9
+
+ mulx 16($inp), %rcx, %r10
+ xor %rbp, %rbp # cf=0, of=0
+
+ mulx 24($inp), %rax, %r11
+ adcx %rcx, %r9
+
+ mulx 32($inp), %rcx, %r12
+ adcx %rax, %r10
+
+ mulx 40($inp), %rax, %r13
+ adcx %rcx, %r11
+
+ .byte 0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($inp), %rcx, %r14
+ adcx %rax, %r12
+ adcx %rcx, %r13
+
+ .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r15
+ adcx %rax, %r14
+ adcx %rbp, %r15 # %rbp is 0
+
+ mov %r9, %rcx
+ shld \$1, %r8, %r9
+ shl \$1, %r8
+
+ xor %ebp, %ebp
+ mulx %rdx, %rax, %rdx
+ adcx %rdx, %r8
+ mov 8($inp), %rdx
+ adcx %rbp, %r9
+
+ mov %rax, (%rsp)
+ mov %r8, 8(%rsp)
+
+#second iteration
+ mulx 16($inp), %rax, %rbx
+ adox %rax, %r10
+ adcx %rbx, %r11
+
+ .byte 0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r8
+ adox $out, %r11
+ adcx %r8, %r12
+
+ mulx 32($inp), %rax, %rbx
+ adox %rax, %r12
+ adcx %rbx, %r13
+
+ mulx 40($inp), $out, %r8
+ adox $out, %r13
+ adcx %r8, %r14
+
+ .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
+ adox %rax, %r14
+ adcx %rbx, %r15
+
+ .byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r8
+ adox $out, %r15
+ adcx %rbp, %r8
+ adox %rbp, %r8
+
+ mov %r11, %rbx
+ shld \$1, %r10, %r11
+ shld \$1, %rcx, %r10
+
+ xor %ebp,%ebp
+ mulx %rdx, %rax, %rcx
+ mov 16($inp), %rdx
+ adcx %rax, %r9
+ adcx %rcx, %r10
+ adcx %rbp, %r11
+
+ mov %r9, 16(%rsp)
+ .byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 # mov %r10, 24(%rsp)
+
+#third iteration
+ .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r9
+ adox $out, %r12
+ adcx %r9, %r13
+
+ mulx 32($inp), %rax, %rcx
+ adox %rax, %r13
+ adcx %rcx, %r14
+
+ mulx 40($inp), $out, %r9
+ adox $out, %r14
+ adcx %r9, %r15
+
+ .byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rcx
+ adox %rax, %r15
+ adcx %rcx, %r8
+
+ .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r9
+ adox $out, %r8
+ adcx %rbp, %r9
+ adox %rbp, %r9
+
+ mov %r13, %rcx
+ shld \$1, %r12, %r13
+ shld \$1, %rbx, %r12
+
+ xor %ebp, %ebp
+ mulx %rdx, %rax, %rdx
+ adcx %rax, %r11
+ adcx %rdx, %r12
+ mov 24($inp), %rdx
+ adcx %rbp, %r13
+
+ mov %r11, 32(%rsp)
+ .byte 0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00 # mov %r12, 40(%rsp)
+
+#fourth iteration
+ .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 # mulx 32($inp), %rax, %rbx
+ adox %rax, %r14
+ adcx %rbx, %r15
+
+ mulx 40($inp), $out, %r10
+ adox $out, %r15
+ adcx %r10, %r8
+
+ mulx 48($inp), %rax, %rbx
+ adox %rax, %r8
+ adcx %rbx, %r9
+
+ mulx 56($inp), $out, %r10
+ adox $out, %r9
+ adcx %rbp, %r10
+ adox %rbp, %r10
+
+ .byte 0x66
+ mov %r15, %rbx
+ shld \$1, %r14, %r15
+ shld \$1, %rcx, %r14
+
+ xor %ebp, %ebp
+ mulx %rdx, %rax, %rdx
+ adcx %rax, %r13
+ adcx %rdx, %r14
+ mov 32($inp), %rdx
+ adcx %rbp, %r15
+
+ mov %r13, 48(%rsp)
+ mov %r14, 56(%rsp)
+
+#fifth iteration
+ .byte 0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r11
+ adox $out, %r8
+ adcx %r11, %r9
+
+ mulx 48($inp), %rax, %rcx
+ adox %rax, %r9
+ adcx %rcx, %r10
+
+ mulx 56($inp), $out, %r11
+ adox $out, %r10
+ adcx %rbp, %r11
+ adox %rbp, %r11
+
+ mov %r9, %rcx
+ shld \$1, %r8, %r9
+ shld \$1, %rbx, %r8
+
+ xor %ebp, %ebp
+ mulx %rdx, %rax, %rdx
+ adcx %rax, %r15
+ adcx %rdx, %r8
+ mov 40($inp), %rdx
+ adcx %rbp, %r9
+
+ mov %r15, 64(%rsp)
+ mov %r8, 72(%rsp)
+
+#sixth iteration
+ .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
+ adox %rax, %r10
+ adcx %rbx, %r11
+
+ .byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r12
+ adox $out, %r11
+ adcx %rbp, %r12
+ adox %rbp, %r12
+
+ mov %r11, %rbx
+ shld \$1, %r10, %r11
+ shld \$1, %rcx, %r10
+
+ xor %ebp, %ebp
+ mulx %rdx, %rax, %rdx
+ adcx %rax, %r9
+ adcx %rdx, %r10
+ mov 48($inp), %rdx
+ adcx %rbp, %r11
+
+ mov %r9, 80(%rsp)
+ mov %r10, 88(%rsp)
+
+#seventh iteration
+ .byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r13
+ adox %rax, %r12
+ adox %rbp, %r13
+
+ xor %r14, %r14
+ shld \$1, %r13, %r14
+ shld \$1, %r12, %r13
+ shld \$1, %rbx, %r12
+
+ xor %ebp, %ebp
+ mulx %rdx, %rax, %rdx
+ adcx %rax, %r11
+ adcx %rdx, %r12
+ mov 56($inp), %rdx
+ adcx %rbp, %r13
+
+ .byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 # mov %r11, 96(%rsp)
+ .byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 # mov %r12, 104(%rsp)
+
+#eighth iteration
+ mulx %rdx, %rax, %rdx
+ adox %rax, %r13
+ adox %rbp, %rdx
+
+ .byte 0x66
+ add %rdx, %r14
+
+ movq %r13, 112(%rsp)
+ movq %r14, 120(%rsp)
+ movq %xmm0, $out
+ movq %xmm1, %rbp
+
+ movq 128(%rsp), %rdx # pull $n0
+ movq (%rsp), %r8
+ movq 8(%rsp), %r9
+ movq 16(%rsp), %r10
+ movq 24(%rsp), %r11
+ movq 32(%rsp), %r12
+ movq 40(%rsp), %r13
+ movq 48(%rsp), %r14
+ movq 56(%rsp), %r15
+
+ call __rsaz_512_reducex
+
+ addq 64(%rsp), %r8
+ adcq 72(%rsp), %r9
+ adcq 80(%rsp), %r10
+ adcq 88(%rsp), %r11
+ adcq 96(%rsp), %r12
+ adcq 104(%rsp), %r13
+ adcq 112(%rsp), %r14
+ adcq 120(%rsp), %r15
+ sbbq %rcx, %rcx
+
+ call __rsaz_512_subtract
+
+ movq %r8, %rdx
+ movq %r9, %rax
+ movl 128+8(%rsp), $times
+ movq $out, $inp
+
+ decl $times
+ jnz .Loop_sqrx
+
+.Lsqr_tail:
+___
+}
+$code.=<<___;
+
+ leaq 128+24+48(%rsp), %rax
+ movq -48(%rax), %r15
+ movq -40(%rax), %r14
+ movq -32(%rax), %r13
+ movq -24(%rax), %r12
+ movq -16(%rax), %rbp
+ movq -8(%rax), %rbx
+ leaq (%rax), %rsp
+.Lsqr_epilogue:
+ ret
+.size rsaz_512_sqr,.-rsaz_512_sqr
+___
+}
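+# (The squaring code above computes each off-diagonal product a[i]*a[j],
+# i < j, only once, doubles it on the fly via the shld/lea carry tricks,
+# and then adds the diagonal a[i]^2 terms: the usual squaring expansion
+# that roughly halves the multiply count of a general 8x8 multiply.)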
+{
+my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
+$code.=<<___;
+.globl rsaz_512_mul
+.type rsaz_512_mul,\@function,5
+.align 32
+rsaz_512_mul:
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ subq \$128+24, %rsp
+.Lmul_body:
+ movq $out, %xmm0 # off-load arguments
+ movq $mod, %xmm1
+ movq $n0, 128(%rsp)
+___
+$code.=<<___ if ($addx);
+ movl \$0x80100,%r11d
+ andl OPENSSL_ia32cap_P+8(%rip),%r11d
+ cmpl \$0x80100,%r11d # check for MULX and ADCX/ADOX
+ je .Lmulx
+___
+$code.=<<___;
+ movq ($bp), %rbx # pass b[0]
+ movq $bp, %rbp # pass argument
+ call __rsaz_512_mul
+
+ movq %xmm0, $out
+ movq %xmm1, %rbp
+
+ movq (%rsp), %r8
+ movq 8(%rsp), %r9
+ movq 16(%rsp), %r10
+ movq 24(%rsp), %r11
+ movq 32(%rsp), %r12
+ movq 40(%rsp), %r13
+ movq 48(%rsp), %r14
+ movq 56(%rsp), %r15
+
+ call __rsaz_512_reduce
+___
+$code.=<<___ if ($addx);
+ jmp .Lmul_tail
+
+.align 32
+.Lmulx:
+ movq $bp, %rbp # pass argument
+ movq ($bp), %rdx # pass b[0]
+ call __rsaz_512_mulx
+
+ movq %xmm0, $out
+ movq %xmm1, %rbp
+
+ movq 128(%rsp), %rdx # pull $n0
+ movq (%rsp), %r8
+ movq 8(%rsp), %r9
+ movq 16(%rsp), %r10
+ movq 24(%rsp), %r11
+ movq 32(%rsp), %r12
+ movq 40(%rsp), %r13
+ movq 48(%rsp), %r14
+ movq 56(%rsp), %r15
+
+ call __rsaz_512_reducex
+.Lmul_tail:
+___
+$code.=<<___;
+ addq 64(%rsp), %r8
+ adcq 72(%rsp), %r9
+ adcq 80(%rsp), %r10
+ adcq 88(%rsp), %r11
+ adcq 96(%rsp), %r12
+ adcq 104(%rsp), %r13
+ adcq 112(%rsp), %r14
+ adcq 120(%rsp), %r15
+ sbbq %rcx, %rcx
+
+ call __rsaz_512_subtract
+
+ leaq 128+24+48(%rsp), %rax
+ movq -48(%rax), %r15
+ movq -40(%rax), %r14
+ movq -32(%rax), %r13
+ movq -24(%rax), %r12
+ movq -16(%rax), %rbp
+ movq -8(%rax), %rbx
+ leaq (%rax), %rsp
+.Lmul_epilogue:
+ ret
+.size rsaz_512_mul,.-rsaz_512_mul
+___
+}
+{
+my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
+$code.=<<___;
+.globl rsaz_512_mul_gather4
+.type rsaz_512_mul_gather4,\@function,6
+.align 32
+rsaz_512_mul_gather4:
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov $pwr, $pwr # zero-extend $pwr
+ subq \$128+24, %rsp
+.Lmul_gather4_body:
+___
+$code.=<<___ if ($addx);
+ movl \$0x80100,%r11d
+ andl OPENSSL_ia32cap_P+8(%rip),%r11d
+ cmpl \$0x80100,%r11d # check for MULX and ADCX/ADOX
+ je .Lmulx_gather
+___
+$code.=<<___;
+ movl 64($bp,$pwr,4), %eax
+ movq $out, %xmm0 # off-load arguments
+ movl ($bp,$pwr,4), %ebx
+ movq $mod, %xmm1
+ movq $n0, 128(%rsp)
+
+ shlq \$32, %rax
+ or %rax, %rbx
+ movq ($ap), %rax
+ movq 8($ap), %rcx
+ leaq 128($bp,$pwr,4), %rbp
+ mulq %rbx # 0 iteration
+ movq %rax, (%rsp)
+ movq %rcx, %rax
+ movq %rdx, %r8
+
+ mulq %rbx
+ movd (%rbp), %xmm4
+ addq %rax, %r8
+ movq 16($ap), %rax
+ movq %rdx, %r9
+ adcq \$0, %r9
+
+ mulq %rbx
+ movd 64(%rbp), %xmm5
+ addq %rax, %r9
+ movq 24($ap), %rax
+ movq %rdx, %r10
+ adcq \$0, %r10
+
+ mulq %rbx
+ pslldq \$4, %xmm5
+ addq %rax, %r10
+ movq 32($ap), %rax
+ movq %rdx, %r11
+ adcq \$0, %r11
+
+ mulq %rbx
+ por %xmm5, %xmm4
+ addq %rax, %r11
+ movq 40($ap), %rax
+ movq %rdx, %r12
+ adcq \$0, %r12
+
+ mulq %rbx
+ addq %rax, %r12
+ movq 48($ap), %rax
+ movq %rdx, %r13
+ adcq \$0, %r13
+
+ mulq %rbx
+ leaq 128(%rbp), %rbp
+ addq %rax, %r13
+ movq 56($ap), %rax
+ movq %rdx, %r14
+ adcq \$0, %r14
+
+ mulq %rbx
+ movq %xmm4, %rbx
+ addq %rax, %r14
+ movq ($ap), %rax
+ movq %rdx, %r15
+ adcq \$0, %r15
+
+ leaq 8(%rsp), %rdi
+ movl \$7, %ecx
+ jmp .Loop_mul_gather
+
+.align 32
+.Loop_mul_gather:
+ mulq %rbx
+ addq %rax, %r8
+ movq 8($ap), %rax
+ movq %r8, (%rdi)
+ movq %rdx, %r8
+ adcq \$0, %r8
+
+ mulq %rbx
+ movd (%rbp), %xmm4
+ addq %rax, %r9
+ movq 16($ap), %rax
+ adcq \$0, %rdx
+ addq %r9, %r8
+ movq %rdx, %r9
+ adcq \$0, %r9
+
+ mulq %rbx
+ movd 64(%rbp), %xmm5
+ addq %rax, %r10
+ movq 24($ap), %rax
+ adcq \$0, %rdx
+ addq %r10, %r9
+ movq %rdx, %r10
+ adcq \$0, %r10
+
+ mulq %rbx
+ pslldq \$4, %xmm5
+ addq %rax, %r11
+ movq 32($ap), %rax
+ adcq \$0, %rdx
+ addq %r11, %r10
+ movq %rdx, %r11
+ adcq \$0, %r11
+
+ mulq %rbx
+ por %xmm5, %xmm4
+ addq %rax, %r12
+ movq 40($ap), %rax
+ adcq \$0, %rdx
+ addq %r12, %r11
+ movq %rdx, %r12
+ adcq \$0, %r12
+
+ mulq %rbx
+ addq %rax, %r13
+ movq 48($ap), %rax
+ adcq \$0, %rdx
+ addq %r13, %r12
+ movq %rdx, %r13
+ adcq \$0, %r13
+
+ mulq %rbx
+ addq %rax, %r14
+ movq 56($ap), %rax
+ adcq \$0, %rdx
+ addq %r14, %r13
+ movq %rdx, %r14
+ adcq \$0, %r14
+
+ mulq %rbx
+ movq %xmm4, %rbx
+ addq %rax, %r15
+ movq ($ap), %rax
+ adcq \$0, %rdx
+ addq %r15, %r14
+ movq %rdx, %r15
+ adcq \$0, %r15
+
+ leaq 128(%rbp), %rbp
+ leaq 8(%rdi), %rdi
+
+ decl %ecx
+ jnz .Loop_mul_gather
+
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq %r12, 32(%rdi)
+ movq %r13, 40(%rdi)
+ movq %r14, 48(%rdi)
+ movq %r15, 56(%rdi)
+
+ movq %xmm0, $out
+ movq %xmm1, %rbp
+
+ movq (%rsp), %r8
+ movq 8(%rsp), %r9
+ movq 16(%rsp), %r10
+ movq 24(%rsp), %r11
+ movq 32(%rsp), %r12
+ movq 40(%rsp), %r13
+ movq 48(%rsp), %r14
+ movq 56(%rsp), %r15
+
+ call __rsaz_512_reduce
+___
+$code.=<<___ if ($addx);
+ jmp .Lmul_gather_tail
+
+.align 32
+.Lmulx_gather:
+ mov 64($bp,$pwr,4), %eax
+ movq $out, %xmm0 # off-load arguments
+ lea 128($bp,$pwr,4), %rbp
+ mov ($bp,$pwr,4), %edx
+ movq $mod, %xmm1
+ mov $n0, 128(%rsp)
+
+ shl \$32, %rax
+ or %rax, %rdx
+ mulx ($ap), %rbx, %r8 # 0 iteration
+ mov %rbx, (%rsp)
+ xor %edi, %edi # cf=0, of=0
+
+ mulx 8($ap), %rax, %r9
+ movd (%rbp), %xmm4
+
+ mulx 16($ap), %rbx, %r10
+ movd 64(%rbp), %xmm5
+ adcx %rax, %r8
+
+ mulx 24($ap), %rax, %r11
+ pslldq \$4, %xmm5
+ adcx %rbx, %r9
+
+ mulx 32($ap), %rbx, %r12
+ por %xmm5, %xmm4
+ adcx %rax, %r10
+
+ mulx 40($ap), %rax, %r13
+ adcx %rbx, %r11
+
+ mulx 48($ap), %rbx, %r14
+ lea 128(%rbp), %rbp
+ adcx %rax, %r12
+
+ mulx 56($ap), %rax, %r15
+ movq %xmm4, %rdx
+ adcx %rbx, %r13
+ adcx %rax, %r14
+ mov %r8, %rbx
+ adcx %rdi, %r15 # %rdi is 0
+
+ mov \$-7, %rcx
+ jmp .Loop_mulx_gather
+
+.align 32
+.Loop_mulx_gather:
+ mulx ($ap), %rax, %r8
+ adcx %rax, %rbx
+ adox %r9, %r8
+
+ mulx 8($ap), %rax, %r9
+ .byte 0x66,0x0f,0x6e,0xa5,0x00,0x00,0x00,0x00 # movd (%rbp), %xmm4
+ adcx %rax, %r8
+ adox %r10, %r9
+
+ mulx 16($ap), %rax, %r10
+ movd 64(%rbp), %xmm5
+ lea 128(%rbp), %rbp
+ adcx %rax, %r9
+ adox %r11, %r10
+
+ .byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 24($ap), %rax, %r11
+ pslldq \$4, %xmm5
+ por %xmm5, %xmm4
+ adcx %rax, %r10
+ adox %r12, %r11
+
+ mulx 32($ap), %rax, %r12
+ adcx %rax, %r11
+ adox %r13, %r12
+
+ mulx 40($ap), %rax, %r13
+ adcx %rax, %r12
+ adox %r14, %r13
+
+ .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
+ adcx %rax, %r13
+ adox %r15, %r14
+
+ mulx 56($ap), %rax, %r15
+ movq %xmm4, %rdx
+ mov %rbx, 64(%rsp,%rcx,8)
+ adcx %rax, %r14
+ adox %rdi, %r15
+ mov %r8, %rbx
+ adcx %rdi, %r15 # cf=0
+
+ inc %rcx # of=0
+ jnz .Loop_mulx_gather
+
+ mov %r8, 64(%rsp)
+ mov %r9, 64+8(%rsp)
+ mov %r10, 64+16(%rsp)
+ mov %r11, 64+24(%rsp)
+ mov %r12, 64+32(%rsp)
+ mov %r13, 64+40(%rsp)
+ mov %r14, 64+48(%rsp)
+ mov %r15, 64+56(%rsp)
+
+ movq %xmm0, $out
+ movq %xmm1, %rbp
+
+ mov 128(%rsp), %rdx # pull $n0
+ mov (%rsp), %r8
+ mov 8(%rsp), %r9
+ mov 16(%rsp), %r10
+ mov 24(%rsp), %r11
+ mov 32(%rsp), %r12
+ mov 40(%rsp), %r13
+ mov 48(%rsp), %r14
+ mov 56(%rsp), %r15
+
+ call __rsaz_512_reducex
+
+.Lmul_gather_tail:
+___
+$code.=<<___;
+ addq 64(%rsp), %r8
+ adcq 72(%rsp), %r9
+ adcq 80(%rsp), %r10
+ adcq 88(%rsp), %r11
+ adcq 96(%rsp), %r12
+ adcq 104(%rsp), %r13
+ adcq 112(%rsp), %r14
+ adcq 120(%rsp), %r15
+ sbbq %rcx, %rcx
+
+ call __rsaz_512_subtract
+
+ leaq 128+24+48(%rsp), %rax
+ movq -48(%rax), %r15
+ movq -40(%rax), %r14
+ movq -32(%rax), %r13
+ movq -24(%rax), %r12
+ movq -16(%rax), %rbp
+ movq -8(%rax), %rbx
+ leaq (%rax), %rsp
+.Lmul_gather4_epilogue:
+ ret
+.size rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
+___
+}
+{
+my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
+$code.=<<___;
+.globl rsaz_512_mul_scatter4
+.type rsaz_512_mul_scatter4,\@function,6
+.align 32
+rsaz_512_mul_scatter4:
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov $pwr, $pwr # zero-extend $pwr
+ subq \$128+24, %rsp
+.Lmul_scatter4_body:
+ leaq ($tbl,$pwr,4), $tbl
+ movq $out, %xmm0 # off-load arguments
+ movq $mod, %xmm1
+ movq $tbl, %xmm2
+ movq $n0, 128(%rsp)
+
+ movq $out, %rbp
+___
+$code.=<<___ if ($addx);
+ movl \$0x80100,%r11d
+ andl OPENSSL_ia32cap_P+8(%rip),%r11d
+ cmpl \$0x80100,%r11d # check for MULX and ADCX/ADOX
+ je .Lmulx_scatter
+___
+$code.=<<___;
+ movq ($out),%rbx # pass b[0]
+ call __rsaz_512_mul
+
+ movq %xmm0, $out
+ movq %xmm1, %rbp
+
+ movq (%rsp), %r8
+ movq 8(%rsp), %r9
+ movq 16(%rsp), %r10
+ movq 24(%rsp), %r11
+ movq 32(%rsp), %r12
+ movq 40(%rsp), %r13
+ movq 48(%rsp), %r14
+ movq 56(%rsp), %r15
+
+ call __rsaz_512_reduce
+___
+$code.=<<___ if ($addx);
+ jmp .Lmul_scatter_tail
+
+.align 32
+.Lmulx_scatter:
+ movq ($out), %rdx # pass b[0]
+ call __rsaz_512_mulx
+
+ movq %xmm0, $out
+ movq %xmm1, %rbp
+
+ movq 128(%rsp), %rdx # pull $n0
+ movq (%rsp), %r8
+ movq 8(%rsp), %r9
+ movq 16(%rsp), %r10
+ movq 24(%rsp), %r11
+ movq 32(%rsp), %r12
+ movq 40(%rsp), %r13
+ movq 48(%rsp), %r14
+ movq 56(%rsp), %r15
+
+ call __rsaz_512_reducex
+
+.Lmul_scatter_tail:
+___
+$code.=<<___;
+ addq 64(%rsp), %r8
+ adcq 72(%rsp), %r9
+ adcq 80(%rsp), %r10
+ adcq 88(%rsp), %r11
+ adcq 96(%rsp), %r12
+ adcq 104(%rsp), %r13
+ adcq 112(%rsp), %r14
+ adcq 120(%rsp), %r15
+ movq %xmm2, $inp
+ sbbq %rcx, %rcx
+
+ call __rsaz_512_subtract
+
+ movl %r8d, 64*0($inp) # scatter
+ shrq \$32, %r8
+ movl %r9d, 64*2($inp)
+ shrq \$32, %r9
+ movl %r10d, 64*4($inp)
+ shrq \$32, %r10
+ movl %r11d, 64*6($inp)
+ shrq \$32, %r11
+ movl %r12d, 64*8($inp)
+ shrq \$32, %r12
+ movl %r13d, 64*10($inp)
+ shrq \$32, %r13
+ movl %r14d, 64*12($inp)
+ shrq \$32, %r14
+ movl %r15d, 64*14($inp)
+ shrq \$32, %r15
+ movl %r8d, 64*1($inp)
+ movl %r9d, 64*3($inp)
+ movl %r10d, 64*5($inp)
+ movl %r11d, 64*7($inp)
+ movl %r12d, 64*9($inp)
+ movl %r13d, 64*11($inp)
+ movl %r14d, 64*13($inp)
+ movl %r15d, 64*15($inp)
+
+ leaq 128+24+48(%rsp), %rax
+ movq -48(%rax), %r15
+ movq -40(%rax), %r14
+ movq -32(%rax), %r13
+ movq -24(%rax), %r12
+ movq -16(%rax), %rbp
+ movq -8(%rax), %rbx
+ leaq (%rax), %rsp
+.Lmul_scatter4_epilogue:
+ ret
+.size rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
+___
+}
+{
+my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx");
+$code.=<<___;
+.globl rsaz_512_mul_by_one
+.type rsaz_512_mul_by_one,\@function,4
+.align 32
+rsaz_512_mul_by_one:
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ subq \$128+24, %rsp
+.Lmul_by_one_body:
+___
+$code.=<<___ if ($addx);
+ movl OPENSSL_ia32cap_P+8(%rip),%eax
+___
+$code.=<<___;
+ movq $mod, %rbp # reassign argument
+ movq $n0, 128(%rsp)
+
+ movq ($inp), %r8
+ pxor %xmm0, %xmm0
+ movq 8($inp), %r9
+ movq 16($inp), %r10
+ movq 24($inp), %r11
+ movq 32($inp), %r12
+ movq 40($inp), %r13
+ movq 48($inp), %r14
+ movq 56($inp), %r15
+
+ movdqa %xmm0, (%rsp)
+ movdqa %xmm0, 16(%rsp)
+ movdqa %xmm0, 32(%rsp)
+ movdqa %xmm0, 48(%rsp)
+ movdqa %xmm0, 64(%rsp)
+ movdqa %xmm0, 80(%rsp)
+ movdqa %xmm0, 96(%rsp)
+___
+$code.=<<___ if ($addx);
+ andl \$0x80100,%eax
+ cmpl \$0x80100,%eax # check for MULX and ADCX/ADOX
+ je .Lby_one_callx
+___
+$code.=<<___;
+ call __rsaz_512_reduce
+___
+$code.=<<___ if ($addx);
+ jmp .Lby_one_tail
+.align 32
+.Lby_one_callx:
+ movq 128(%rsp), %rdx # pull $n0
+ call __rsaz_512_reducex
+.Lby_one_tail:
+___
+$code.=<<___;
+ movq %r8, ($out)
+ movq %r9, 8($out)
+ movq %r10, 16($out)
+ movq %r11, 24($out)
+ movq %r12, 32($out)
+ movq %r13, 40($out)
+ movq %r14, 48($out)
+ movq %r15, 56($out)
+
+ leaq 128+24+48(%rsp), %rax
+ movq -48(%rax), %r15
+ movq -40(%rax), %r14
+ movq -32(%rax), %r13
+ movq -24(%rax), %r12
+ movq -16(%rax), %rbp
+ movq -8(%rax), %rbx
+ leaq (%rax), %rsp
+.Lmul_by_one_epilogue:
+ ret
+.size rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
+___
+}
+{ # __rsaz_512_reduce
+ #
+ # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
+ # output: %r8-%r15
+ # clobbers: everything except %rbp and %rdi
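+ # A rough C sketch of the word-wise Montgomery reduction performed
+ # here (names are illustrative, not from this file): with the eight
+ # limbs t[] held in %r8-%r15, modulus n[] at (%rbp), and
+ # n0 = -n^-1 mod 2^64,
+ #
+ #   for (i = 0; i < 8; i++) {
+ #     m = t[0] * n0;          /* mod 2^64 */
+ #     t = (t + m * n) >> 64;  /* low limb becomes 0 and is dropped */
+ #   }
+ #
+ # The callers then add the upper eight limbs of the product back in
+ # and conditionally subtract the modulus.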
+$code.=<<___;
+.type __rsaz_512_reduce,\@abi-omnipotent
+.align 32
+__rsaz_512_reduce:
+ movq %r8, %rbx
+ imulq 128+8(%rsp), %rbx
+ movq 0(%rbp), %rax
+ movl \$8, %ecx
+ jmp .Lreduction_loop
+
+.align 32
+.Lreduction_loop:
+ mulq %rbx
+ movq 8(%rbp), %rax
+ negq %r8
+ movq %rdx, %r8
+ adcq \$0, %r8
+
+ mulq %rbx
+ addq %rax, %r9
+ movq 16(%rbp), %rax
+ adcq \$0, %rdx
+ addq %r9, %r8
+ movq %rdx, %r9
+ adcq \$0, %r9
+
+ mulq %rbx
+ addq %rax, %r10
+ movq 24(%rbp), %rax
+ adcq \$0, %rdx
+ addq %r10, %r9
+ movq %rdx, %r10
+ adcq \$0, %r10
+
+ mulq %rbx
+ addq %rax, %r11
+ movq 32(%rbp), %rax
+ adcq \$0, %rdx
+ addq %r11, %r10
+ movq 128+8(%rsp), %rsi
+ #movq %rdx, %r11
+ #adcq \$0, %r11
+ adcq \$0, %rdx
+ movq %rdx, %r11
+
+ mulq %rbx
+ addq %rax, %r12
+ movq 40(%rbp), %rax
+ adcq \$0, %rdx
+ imulq %r8, %rsi
+ addq %r12, %r11
+ movq %rdx, %r12
+ adcq \$0, %r12
+
+ mulq %rbx
+ addq %rax, %r13
+ movq 48(%rbp), %rax
+ adcq \$0, %rdx
+ addq %r13, %r12
+ movq %rdx, %r13
+ adcq \$0, %r13
+
+ mulq %rbx
+ addq %rax, %r14
+ movq 56(%rbp), %rax
+ adcq \$0, %rdx
+ addq %r14, %r13
+ movq %rdx, %r14
+ adcq \$0, %r14
+
+ mulq %rbx
+ movq %rsi, %rbx
+ addq %rax, %r15
+ movq 0(%rbp), %rax
+ adcq \$0, %rdx
+ addq %r15, %r14
+ movq %rdx, %r15
+ adcq \$0, %r15
+
+ decl %ecx
+ jne .Lreduction_loop
+
+ ret
+.size __rsaz_512_reduce,.-__rsaz_512_reduce
+___
+}
+if ($addx) {
+ # __rsaz_512_reducex
+ #
+ # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
+ # output: %r8-%r15
+ # clobbers: everything except %rbp and %rdi
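+ # Same reduction as __rsaz_512_reduce, but built on MULX/ADCX/ADOX:
+ # MULX leaves the flags untouched, and ADCX/ADOX use CF and OF as two
+ # independent carry chains that can run in parallel. A hedged sketch
+ # of one limb step, in C-like pseudocode (illustrative names):
+ #
+ #   (hi, lo) = n[j] * m;            /* mulx: m preloaded in %rdx */
+ #   acc[j]   = adcx(acc[j], lo);    /* carries ride on CF */
+ #   acc[j+1] = adox(acc[j+1], hi);  /* carries ride on OF */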
+$code.=<<___;
+.type __rsaz_512_reducex,\@abi-omnipotent
+.align 32
+__rsaz_512_reducex:
+ #movq 128+8(%rsp), %rdx # pull $n0
+ imulq %r8, %rdx
+ xorq %rsi, %rsi # cf=0,of=0
+ movl \$8, %ecx
+ jmp .Lreduction_loopx
+
+.align 32
+.Lreduction_loopx:
+ mov %r8, %rbx
+ mulx 0(%rbp), %rax, %r8
+ adcx %rbx, %rax
+ adox %r9, %r8
+
+ mulx 8(%rbp), %rax, %r9
+ adcx %rax, %r8
+ adox %r10, %r9
+
+ mulx 16(%rbp), %rbx, %r10
+ adcx %rbx, %r9
+ adox %r11, %r10
+
+ mulx 24(%rbp), %rbx, %r11
+ adcx %rbx, %r10
+ adox %r12, %r11
+
+ .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 32(%rbp), %rbx, %r12
+ mov %rdx, %rax
+ mov %r8, %rdx
+ adcx %rbx, %r11
+ adox %r13, %r12
+
+ mulx 128+8(%rsp), %rbx, %rdx
+ mov %rax, %rdx
+
+ mulx 40(%rbp), %rax, %r13
+ adcx %rax, %r12
+ adox %r14, %r13
+
+ .byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 # mulx 48(%rbp), %rax, %r14
+ adcx %rax, %r13
+ adox %r15, %r14
+
+ mulx 56(%rbp), %rax, %r15
+ mov %rbx, %rdx
+ adcx %rax, %r14
+ adox %rsi, %r15 # %rsi is 0
+ adcx %rsi, %r15 # cf=0
+
+ decl %ecx # of=0
+ jne .Lreduction_loopx
+
+ ret
+.size __rsaz_512_reducex,.-__rsaz_512_reducex
+___
+}
+{ # __rsaz_512_subtract
+ # input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
+ # output: result at ($out), also left in %r8-%r15
+ # clobbers: everything but %rdi, %rsi and %rbp
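+ # The mask in %rcx is 0 or all-ones, set from the carry of the
+ # caller's preceding addition. A hedged C sketch of the idea (names
+ # illustrative): store t to ($out), then compute
+ #
+ #   out += (-n) & mask;   /* subtracts n exactly when mask is -1 */
+ #
+ # which the code does limb-wise with neg/not + and + add-with-carry,
+ # keeping the conditional subtraction branch-free.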
+$code.=<<___;
+.type __rsaz_512_subtract,\@abi-omnipotent
+.align 32
+__rsaz_512_subtract:
+ movq %r8, ($out)
+ movq %r9, 8($out)
+ movq %r10, 16($out)
+ movq %r11, 24($out)
+ movq %r12, 32($out)
+ movq %r13, 40($out)
+ movq %r14, 48($out)
+ movq %r15, 56($out)
+
+ movq 0($mod), %r8
+ movq 8($mod), %r9
+ negq %r8
+ notq %r9
+ andq %rcx, %r8
+ movq 16($mod), %r10
+ andq %rcx, %r9
+ notq %r10
+ movq 24($mod), %r11
+ andq %rcx, %r10
+ notq %r11
+ movq 32($mod), %r12
+ andq %rcx, %r11
+ notq %r12
+ movq 40($mod), %r13
+ andq %rcx, %r12
+ notq %r13
+ movq 48($mod), %r14
+ andq %rcx, %r13
+ notq %r14
+ movq 56($mod), %r15
+ andq %rcx, %r14
+ notq %r15
+ andq %rcx, %r15
+
+ addq ($out), %r8
+ adcq 8($out), %r9
+ adcq 16($out), %r10
+ adcq 24($out), %r11
+ adcq 32($out), %r12
+ adcq 40($out), %r13
+ adcq 48($out), %r14
+ adcq 56($out), %r15
+
+ movq %r8, ($out)
+ movq %r9, 8($out)
+ movq %r10, 16($out)
+ movq %r11, 24($out)
+ movq %r12, 32($out)
+ movq %r13, 40($out)
+ movq %r14, 48($out)
+ movq %r15, 56($out)
+
+ ret
+.size __rsaz_512_subtract,.-__rsaz_512_subtract
+___
+}
+{ # __rsaz_512_mul
+ #
+ # input: %rsi - ap, %rbp - bp
+ # output:
+ # clobbers: everything
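+ # Classic schoolbook multiplication: each pass multiplies all eight
+ # a[] limbs by one b[] word, accumulating in %r8-%r15 and retiring
+ # one finished limb to the stack per pass. A hedged C sketch
+ # (illustrative names):
+ #
+ #   for (i = 0; i < 8; i++)
+ #     t[i..i+8] += a[0..7] * b[i];  /* 64x64->128 mulq + carry chain */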
+my ($ap,$bp) = ("%rsi","%rbp");
+$code.=<<___;
+.type __rsaz_512_mul,\@abi-omnipotent
+.align 32
+__rsaz_512_mul:
+ leaq 8(%rsp), %rdi
+
+ movq ($ap), %rax
+ mulq %rbx
+ movq %rax, (%rdi)
+ movq 8($ap), %rax
+ movq %rdx, %r8
+
+ mulq %rbx
+ addq %rax, %r8
+ movq 16($ap), %rax
+ movq %rdx, %r9
+ adcq \$0, %r9
+
+ mulq %rbx
+ addq %rax, %r9
+ movq 24($ap), %rax
+ movq %rdx, %r10
+ adcq \$0, %r10
+
+ mulq %rbx
+ addq %rax, %r10
+ movq 32($ap), %rax
+ movq %rdx, %r11
+ adcq \$0, %r11
+
+ mulq %rbx
+ addq %rax, %r11
+ movq 40($ap), %rax
+ movq %rdx, %r12
+ adcq \$0, %r12
+
+ mulq %rbx
+ addq %rax, %r12
+ movq 48($ap), %rax
+ movq %rdx, %r13
+ adcq \$0, %r13
+
+ mulq %rbx
+ addq %rax, %r13
+ movq 56($ap), %rax
+ movq %rdx, %r14
+ adcq \$0, %r14
+
+ mulq %rbx
+ addq %rax, %r14
+ movq ($ap), %rax
+ movq %rdx, %r15
+ adcq \$0, %r15
+
+ leaq 8($bp), $bp
+ leaq 8(%rdi), %rdi
+
+ movl \$7, %ecx
+ jmp .Loop_mul
+
+.align 32
+.Loop_mul:
+ movq ($bp), %rbx
+ mulq %rbx
+ addq %rax, %r8
+ movq 8($ap), %rax
+ movq %r8, (%rdi)
+ movq %rdx, %r8
+ adcq \$0, %r8
+
+ mulq %rbx
+ addq %rax, %r9
+ movq 16($ap), %rax
+ adcq \$0, %rdx
+ addq %r9, %r8
+ movq %rdx, %r9
+ adcq \$0, %r9
+
+ mulq %rbx
+ addq %rax, %r10
+ movq 24($ap), %rax
+ adcq \$0, %rdx
+ addq %r10, %r9
+ movq %rdx, %r10
+ adcq \$0, %r10
+
+ mulq %rbx
+ addq %rax, %r11
+ movq 32($ap), %rax
+ adcq \$0, %rdx
+ addq %r11, %r10
+ movq %rdx, %r11
+ adcq \$0, %r11
+
+ mulq %rbx
+ addq %rax, %r12
+ movq 40($ap), %rax
+ adcq \$0, %rdx
+ addq %r12, %r11
+ movq %rdx, %r12
+ adcq \$0, %r12
+
+ mulq %rbx
+ addq %rax, %r13
+ movq 48($ap), %rax
+ adcq \$0, %rdx
+ addq %r13, %r12
+ movq %rdx, %r13
+ adcq \$0, %r13
+
+ mulq %rbx
+ addq %rax, %r14
+ movq 56($ap), %rax
+ adcq \$0, %rdx
+ addq %r14, %r13
+ movq %rdx, %r14
+ leaq 8($bp), $bp
+ adcq \$0, %r14
+
+ mulq %rbx
+ addq %rax, %r15
+ movq ($ap), %rax
+ adcq \$0, %rdx
+ addq %r15, %r14
+ movq %rdx, %r15
+ adcq \$0, %r15
+
+ leaq 8(%rdi), %rdi
+
+ decl %ecx
+ jnz .Loop_mul
+
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq %r12, 32(%rdi)
+ movq %r13, 40(%rdi)
+ movq %r14, 48(%rdi)
+ movq %r15, 56(%rdi)
+
+ ret
+.size __rsaz_512_mul,.-__rsaz_512_mul
+___
+}
+if ($addx) {
+ # __rsaz_512_mulx
+ #
+ # input: %rsi - ap, %rbp - bp
+ # output:
+ # clobbers: everything
+my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
+$code.=<<___;
+.type __rsaz_512_mulx,\@abi-omnipotent
+.align 32
+__rsaz_512_mulx:
+ mulx ($ap), %rbx, %r8 # initial %rdx preloaded by caller
+ mov \$-6, %rcx
+
+ mulx 8($ap), %rax, %r9
+ movq %rbx, 8(%rsp)
+
+ mulx 16($ap), %rbx, %r10
+ adc %rax, %r8
+
+ mulx 24($ap), %rax, %r11
+ adc %rbx, %r9
+
+ mulx 32($ap), %rbx, %r12
+ adc %rax, %r10
+
+ mulx 40($ap), %rax, %r13
+ adc %rbx, %r11
+
+ mulx 48($ap), %rbx, %r14
+ adc %rax, %r12
+
+ mulx 56($ap), %rax, %r15
+ mov 8($bp), %rdx
+ adc %rbx, %r13
+ adc %rax, %r14
+ adc \$0, %r15
+
+ xor $zero, $zero # cf=0,of=0
+ jmp .Loop_mulx
+
+.align 32
+.Loop_mulx:
+ movq %r8, %rbx
+ mulx ($ap), %rax, %r8
+ adcx %rax, %rbx
+ adox %r9, %r8
+
+ mulx 8($ap), %rax, %r9
+ adcx %rax, %r8
+ adox %r10, %r9
+
+ mulx 16($ap), %rax, %r10
+ adcx %rax, %r9
+ adox %r11, %r10
+
+ mulx 24($ap), %rax, %r11
+ adcx %rax, %r10
+ adox %r12, %r11
+
+ .byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($ap), %rax, %r12
+ adcx %rax, %r11
+ adox %r13, %r12
+
+ mulx 40($ap), %rax, %r13
+ adcx %rax, %r12
+ adox %r14, %r13
+
+ mulx 48($ap), %rax, %r14
+ adcx %rax, %r13
+ adox %r15, %r14
+
+ mulx 56($ap), %rax, %r15
+ movq 64($bp,%rcx,8), %rdx
+ movq %rbx, 8+64-8(%rsp,%rcx,8)
+ adcx %rax, %r14
+ adox $zero, %r15
+ adcx $zero, %r15 # cf=0
+
+ inc %rcx # of=0
+ jnz .Loop_mulx
+
+ movq %r8, %rbx
+ mulx ($ap), %rax, %r8
+ adcx %rax, %rbx
+ adox %r9, %r8
+
+ .byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00 # mulx 8($ap), %rax, %r9
+ adcx %rax, %r8
+ adox %r10, %r9
+
+ .byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00 # mulx 16($ap), %rax, %r10
+ adcx %rax, %r9
+ adox %r11, %r10
+
+ mulx 24($ap), %rax, %r11
+ adcx %rax, %r10
+ adox %r12, %r11
+
+ mulx 32($ap), %rax, %r12
+ adcx %rax, %r11
+ adox %r13, %r12
+
+ mulx 40($ap), %rax, %r13
+ adcx %rax, %r12
+ adox %r14, %r13
+
+ .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
+ adcx %rax, %r13
+ adox %r15, %r14
+
+ .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($ap), %rax, %r15
+ adcx %rax, %r14
+ adox $zero, %r15
+ adcx $zero, %r15
+
+ mov %rbx, 8+64-8(%rsp)
+ mov %r8, 8+64(%rsp)
+ mov %r9, 8+64+8(%rsp)
+ mov %r10, 8+64+16(%rsp)
+ mov %r11, 8+64+24(%rsp)
+ mov %r12, 8+64+32(%rsp)
+ mov %r13, 8+64+40(%rsp)
+ mov %r14, 8+64+48(%rsp)
+ mov %r15, 8+64+56(%rsp)
+
+ ret
+.size __rsaz_512_mulx,.-__rsaz_512_mulx
+___
+}
+{
+my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
+$code.=<<___;
+.globl rsaz_512_scatter4
+.type rsaz_512_scatter4,\@abi-omnipotent
+.align 16
+rsaz_512_scatter4:
+ leaq ($out,$power,4), $out
+ movl \$8, %r9d
+ jmp .Loop_scatter
+.align 16
+.Loop_scatter:
+ movq ($inp), %rax
+ leaq 8($inp), $inp
+ movl %eax, ($out)
+ shrq \$32, %rax
+ movl %eax, 64($out)
+ leaq 128($out), $out
+ decl %r9d
+ jnz .Loop_scatter
+ ret
+.size rsaz_512_scatter4,.-rsaz_512_scatter4
+
+.globl rsaz_512_gather4
+.type rsaz_512_gather4,\@abi-omnipotent
+.align 16
+rsaz_512_gather4:
+ leaq ($inp,$power,4), $inp
+ movl \$8, %r9d
+ jmp .Loop_gather
+.align 16
+.Loop_gather:
+ movl ($inp), %eax
+ movl 64($inp), %r8d
+ leaq 128($inp), $inp
+ shlq \$32, %r8
+ or %r8, %rax
+ movq %rax, ($out)
+ leaq 8($out), $out
+ decl %r9d
+ jnz .Loop_gather
+ ret
+.size rsaz_512_gather4,.-rsaz_512_gather4
+___
+}
+
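+# The scatter/gather pair above splits each 64-bit limb into two 32-bit
+# halves stored 64 bytes apart, with consecutive limbs 128 bytes apart.
+# A hedged sketch of the layout over a uint32_t table (illustrative
+# names, i = 0..7):
+#
+#   tbl32[power + 32*i]      = (uint32_t)limb[i];         /* low half */
+#   tbl32[power + 32*i + 16] = (uint32_t)(limb[i] >> 32); /* high half */
+#
+# which is the same layout the 64*k($inp) offsets in
+# rsaz_512_mul_scatter4 above address.
+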
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type se_handler,\@abi-omnipotent
+.align 16
+se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # end of prologue label
+ cmp %r10,%rbx # context->Rip<end of prologue label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ lea 128+24+48(%rax),%rax
+
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
+ mov -48(%rax),%r15
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
+
+.Lcommon_seh_tail:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size se_handler,.-se_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_rsaz_512_sqr
+ .rva .LSEH_end_rsaz_512_sqr
+ .rva .LSEH_info_rsaz_512_sqr
+
+ .rva .LSEH_begin_rsaz_512_mul
+ .rva .LSEH_end_rsaz_512_mul
+ .rva .LSEH_info_rsaz_512_mul
+
+ .rva .LSEH_begin_rsaz_512_mul_gather4
+ .rva .LSEH_end_rsaz_512_mul_gather4
+ .rva .LSEH_info_rsaz_512_mul_gather4
+
+ .rva .LSEH_begin_rsaz_512_mul_scatter4
+ .rva .LSEH_end_rsaz_512_mul_scatter4
+ .rva .LSEH_info_rsaz_512_mul_scatter4
+
+ .rva .LSEH_begin_rsaz_512_mul_by_one
+ .rva .LSEH_end_rsaz_512_mul_by_one
+ .rva .LSEH_info_rsaz_512_mul_by_one
+
+.section .xdata
+.align 8
+.LSEH_info_rsaz_512_sqr:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lsqr_body,.Lsqr_epilogue # HandlerData[]
+.LSEH_info_rsaz_512_mul:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
+.LSEH_info_rsaz_512_mul_gather4:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lmul_gather4_body,.Lmul_gather4_epilogue # HandlerData[]
+.LSEH_info_rsaz_512_mul_scatter4:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lmul_scatter4_body,.Lmul_scatter4_epilogue # HandlerData[]
+.LSEH_info_rsaz_512_mul_by_one:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lmul_by_one_body,.Lmul_by_one_epilogue # HandlerData[]
+___
+}
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+print $code;
+close STDOUT;
diff --git a/src/crypto/bn/asm/x86-mont.pl b/src/crypto/bn/asm/x86-mont.pl
new file mode 100644
index 0000000..0626b48
--- /dev/null
+++ b/src/crypto/bn/asm/x86-mont.pl
@@ -0,0 +1,592 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# October 2005
+#
+# This is "teaser" code, as it can be improved in several ways...
+# First of all, a non-SSE2 path should be implemented (for now the
+# module performs Montgomery multiplication/convolution only on
+# SSE2-capable CPUs such as the P4; others fall back to the original
+# code). Then the inner loop can be unrolled and modulo-scheduled to
+# improve ILP, and possibly moved to the 128-bit XMM register bank
+# (though that would require input rearrangement and/or increase bus
+# bandwidth utilization). A dedicated squaring procedure should give a
+# further performance improvement...
+# Yet, even as a draft, the code improves the rsa512 *sign* benchmark
+# by 110%(!), rsa1024 by 70% and rsa4096 by 20%:-)
+
+# December 2006
+#
+# Modulo-scheduling the SSE2 loops results in a further 15-20%
+# improvement. Integer-only code [equipped with a dedicated squaring
+# procedure] gains ~40% on the rsa512 sign benchmark...
+
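+# For orientation: bn_mul_mont computes r = a*b*R^-1 mod n with
+# R = 2^(32*num) and n0 = -n^-1 mod 2^32. A hedged word-wise C sketch
+# of the schedule both paths below implement (names illustrative):
+#
+#   for (i = 0; i < num; i++) {
+#     t += a * b[i];            /* num-word by one-word multiply */
+#     m  = t[0] * n0;           /* mod 2^32 */
+#     t  = (t + m * n) >> 32;   /* low word becomes 0, drop it */
+#   }
+#   if (t >= n) t -= n;         /* final conditional subtraction */
+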
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],$0);
+
+$sse2=0;
+for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
+
+&external_label("OPENSSL_ia32cap_P") if ($sse2);
+
+&function_begin("bn_mul_mont");
+
+$i="edx";
+$j="ecx";
+$ap="esi"; $tp="esi"; # overlapping variables!!!
+$rp="edi"; $bp="edi"; # overlapping variables!!!
+$np="ebp";
+$num="ebx";
+
+$_num=&DWP(4*0,"esp"); # stack top layout
+$_rp=&DWP(4*1,"esp");
+$_ap=&DWP(4*2,"esp");
+$_bp=&DWP(4*3,"esp");
+$_np=&DWP(4*4,"esp");
+$_n0=&DWP(4*5,"esp"); $_n0q=&QWP(4*5,"esp");
+$_sp=&DWP(4*6,"esp");
+$_bpend=&DWP(4*7,"esp");
+$frame=32; # size of above frame rounded up to 16n
+
+ &xor ("eax","eax");
+ &mov ("edi",&wparam(5)); # int num
+ &cmp ("edi",4);
+ &jl (&label("just_leave"));
+
+ &lea ("esi",&wparam(0)); # put aside pointer to argument block
+ &lea ("edx",&wparam(1)); # load ap
+ &mov ("ebp","esp"); # saved stack pointer!
+ &add ("edi",2); # extra two words on top of tp
+ &neg ("edi");
+ &lea ("esp",&DWP(-$frame,"esp","edi",4)); # alloca($frame+4*(num+2))
+ &neg ("edi");
+
+ # minimize cache contention by arranging a 2K window between the
+ # stack pointer and the ap argument [np is also a position-sensitive
+ # vector, but it's assumed to be near ap, as it's allocated at about
+ # the same time].
+ &mov ("eax","esp");
+ &sub ("eax","edx");
+ &and ("eax",2047);
+ &sub ("esp","eax"); # this aligns sp and ap modulo 2048
+
+ &xor ("edx","esp");
+ &and ("edx",2048);
+ &xor ("edx",2048);
+ &sub ("esp","edx"); # this splits them apart modulo 4096
+
+ &and ("esp",-64); # align to cache line
+
+ ################################# load argument block...
+ &mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
+ &mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
+ &mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
+ &mov ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
+ &mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
+ #&mov ("edi",&DWP(5*4,"esi"));# int num
+
+ &mov ("esi",&DWP(0,"esi")); # pull n0[0]
+ &mov ($_rp,"eax"); # ... save a copy of argument block
+ &mov ($_ap,"ebx");
+ &mov ($_bp,"ecx");
+ &mov ($_np,"edx");
+ &mov ($_n0,"esi");
+ &lea ($num,&DWP(-3,"edi")); # num=num-1 to assist modulo-scheduling
+ #&mov ($_num,$num); # redundant as $num is not reused
+ &mov ($_sp,"ebp"); # saved stack pointer!
+
+if($sse2) {
+$acc0="mm0"; # mmx register bank layout
+$acc1="mm1";
+$car0="mm2";
+$car1="mm3";
+$mul0="mm4";
+$mul1="mm5";
+$temp="mm6";
+$mask="mm7";
+
+ &picmeup("eax","OPENSSL_ia32cap_P");
+ &bt (&DWP(0,"eax"),26);
+ &jnc (&label("non_sse2"));
+
+ &mov ("eax",-1);
+ &movd ($mask,"eax"); # mask 32 lower bits
+
+ &mov ($ap,$_ap); # load input pointers
+ &mov ($bp,$_bp);
+ &mov ($np,$_np);
+
+ &xor ($i,$i); # i=0
+ &xor ($j,$j); # j=0
+
+ &movd ($mul0,&DWP(0,$bp)); # bp[0]
+ &movd ($mul1,&DWP(0,$ap)); # ap[0]
+ &movd ($car1,&DWP(0,$np)); # np[0]
+
+ &pmuludq($mul1,$mul0); # ap[0]*bp[0]
+ &movq ($car0,$mul1);
+ &movq ($acc0,$mul1); # I wish movd worked for
+ &pand ($acc0,$mask); # inter-register transfers
+
+ &pmuludq($mul1,$_n0q); # *=n0
+
+ &pmuludq($car1,$mul1); # "t[0]"*np[0]*n0
+ &paddq ($car1,$acc0);
+
+ &movd ($acc1,&DWP(4,$np)); # np[1]
+ &movd ($acc0,&DWP(4,$ap)); # ap[1]
+
+ &psrlq ($car0,32);
+ &psrlq ($car1,32);
+
+ &inc ($j); # j++
+&set_label("1st",16);
+ &pmuludq($acc0,$mul0); # ap[j]*bp[0]
+ &pmuludq($acc1,$mul1); # np[j]*m1
+ &paddq ($car0,$acc0); # +=c0
+ &paddq ($car1,$acc1); # +=c1
+
+ &movq ($acc0,$car0);
+ &pand ($acc0,$mask);
+ &movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
+ &paddq ($car1,$acc0); # +=ap[j]*bp[0];
+ &movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
+ &psrlq ($car0,32);
+ &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[j-1]=
+ &psrlq ($car1,32);
+
+ &lea ($j,&DWP(1,$j));
+ &cmp ($j,$num);
+ &jl (&label("1st"));
+
+ &pmuludq($acc0,$mul0); # ap[num-1]*bp[0]
+ &pmuludq($acc1,$mul1); # np[num-1]*m1
+ &paddq ($car0,$acc0); # +=c0
+ &paddq ($car1,$acc1); # +=c1
+
+ &movq ($acc0,$car0);
+ &pand ($acc0,$mask);
+ &paddq ($car1,$acc0); # +=ap[num-1]*bp[0];
+ &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
+
+ &psrlq ($car0,32);
+ &psrlq ($car1,32);
+
+ &paddq ($car1,$car0);
+ &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
+
+ &inc ($i); # i++
+&set_label("outer");
+ &xor ($j,$j); # j=0
+
+ &movd ($mul0,&DWP(0,$bp,$i,4)); # bp[i]
+ &movd ($mul1,&DWP(0,$ap)); # ap[0]
+ &movd ($temp,&DWP($frame,"esp")); # tp[0]
+ &movd ($car1,&DWP(0,$np)); # np[0]
+ &pmuludq($mul1,$mul0); # ap[0]*bp[i]
+
+ &paddq ($mul1,$temp); # +=tp[0]
+ &movq ($acc0,$mul1);
+ &movq ($car0,$mul1);
+ &pand ($acc0,$mask);
+
+ &pmuludq($mul1,$_n0q); # *=n0
+
+ &pmuludq($car1,$mul1);
+ &paddq ($car1,$acc0);
+
+ &movd ($temp,&DWP($frame+4,"esp")); # tp[1]
+ &movd ($acc1,&DWP(4,$np)); # np[1]
+ &movd ($acc0,&DWP(4,$ap)); # ap[1]
+
+ &psrlq ($car0,32);
+ &psrlq ($car1,32);
+ &paddq ($car0,$temp); # +=tp[1]
+
+ &inc ($j); # j++
+ &dec ($num);
+&set_label("inner");
+ &pmuludq($acc0,$mul0); # ap[j]*bp[i]
+ &pmuludq($acc1,$mul1); # np[j]*m1
+ &paddq ($car0,$acc0); # +=c0
+ &paddq ($car1,$acc1); # +=c1
+
+ &movq ($acc0,$car0);
+ &movd ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
+ &pand ($acc0,$mask);
+ &movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
+ &paddq ($car1,$acc0); # +=ap[j]*bp[i]+tp[j]
+ &movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
+ &psrlq ($car0,32);
+ &movd (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
+ &psrlq ($car1,32);
+ &paddq ($car0,$temp); # +=tp[j+1]
+
+ &dec ($num);
+ &lea ($j,&DWP(1,$j)); # j++
+ &jnz (&label("inner"));
+
+ &mov ($num,$j);
+ &pmuludq($acc0,$mul0); # ap[num-1]*bp[i]
+ &pmuludq($acc1,$mul1); # np[num-1]*m1
+ &paddq ($car0,$acc0); # +=c0
+ &paddq ($car1,$acc1); # +=c1
+
+ &movq ($acc0,$car0);
+ &pand ($acc0,$mask);
+ &paddq ($car1,$acc0); # +=ap[num-1]*bp[i]+tp[num-1]
+ &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
+ &psrlq ($car0,32);
+ &psrlq ($car1,32);
+
+ &movd ($temp,&DWP($frame+4,"esp",$num,4)); # += tp[num]
+ &paddq ($car1,$car0);
+ &paddq ($car1,$temp);
+ &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
+
+ &lea ($i,&DWP(1,$i)); # i++
+ &cmp ($i,$num);
+ &jle (&label("outer"));
+
+ &emms (); # done with mmx bank
+ &jmp (&label("common_tail"));
+
+&set_label("non_sse2",16);
+}
+
+if (0) {
+ &mov ("esp",$_sp);
+ &xor ("eax","eax"); # signal "not fast enough [yet]"
+ &jmp (&label("just_leave"));
+ # While the code below provides competitive performance for
+ # all key lengths on modern Intel cores, it's still more than
+ # 10% slower for a 4096-bit key elsewhere:-( "Competitive"
+ # means compared to the original integer-only assembler.
+ # 512-bit RSA sign is better by ~40%, but that's about all
+ # one can say about all CPUs...
+} else {
+$inp="esi"; # integer path uses these registers differently
+$word="edi";
+$carry="ebp";
+
+ &mov ($inp,$_ap);
+ &lea ($carry,&DWP(1,$num));
+ &mov ($word,$_bp);
+ &xor ($j,$j); # j=0
+ &mov ("edx",$inp);
+ &and ($carry,1); # see if num is even
+ &sub ("edx",$word); # see if ap==bp
+ &lea ("eax",&DWP(4,$word,$num,4)); # &bp[num]
+ &or ($carry,"edx");
+ &mov ($word,&DWP(0,$word)); # bp[0]
+ &jz (&label("bn_sqr_mont"));
+ &mov ($_bpend,"eax");
+ &mov ("eax",&DWP(0,$inp));
+ &xor ("edx","edx");
+
+&set_label("mull",16);
+ &mov ($carry,"edx");
+ &mul ($word); # ap[j]*bp[0]
+ &add ($carry,"eax");
+ &lea ($j,&DWP(1,$j));
+ &adc ("edx",0);
+ &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1]
+ &cmp ($j,$num);
+ &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
+ &jl (&label("mull"));
+
+ &mov ($carry,"edx");
+ &mul ($word); # ap[num-1]*bp[0]
+ &mov ($word,$_n0);
+ &add ("eax",$carry);
+ &mov ($inp,$_np);
+ &adc ("edx",0);
+ &imul ($word,&DWP($frame,"esp")); # n0*tp[0]
+
+ &mov (&DWP($frame,"esp",$num,4),"eax"); # tp[num-1]=
+ &xor ($j,$j);
+ &mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]=
+ &mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]=
+
+ &mov ("eax",&DWP(0,$inp)); # np[0]
+ &mul ($word); # np[0]*m
+ &add ("eax",&DWP($frame,"esp")); # +=tp[0]
+ &mov ("eax",&DWP(4,$inp)); # np[1]
+ &adc ("edx",0);
+ &inc ($j);
+
+ &jmp (&label("2ndmadd"));
+
+&set_label("1stmadd",16);
+ &mov ($carry,"edx");
+ &mul ($word); # ap[j]*bp[i]
+ &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
+ &lea ($j,&DWP(1,$j));
+ &adc ("edx",0);
+ &add ($carry,"eax");
+ &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1]
+ &adc ("edx",0);
+ &cmp ($j,$num);
+ &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
+ &jl (&label("1stmadd"));
+
+ &mov ($carry,"edx");
+ &mul ($word); # ap[num-1]*bp[i]
+ &add ("eax",&DWP($frame,"esp",$num,4)); # +=tp[num-1]
+ &mov ($word,$_n0);
+ &adc ("edx",0);
+ &mov ($inp,$_np);
+ &add ($carry,"eax");
+ &adc ("edx",0);
+ &imul ($word,&DWP($frame,"esp")); # n0*tp[0]
+
+ &xor ($j,$j);
+ &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
+ &mov (&DWP($frame,"esp",$num,4),$carry); # tp[num-1]=
+ &adc ($j,0);
+ &mov ("eax",&DWP(0,$inp)); # np[0]
+ &mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]=
+ &mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]=
+
+ &mul ($word); # np[0]*m
+ &add ("eax",&DWP($frame,"esp")); # +=tp[0]
+ &mov ("eax",&DWP(4,$inp)); # np[1]
+ &adc ("edx",0);
+ &mov ($j,1);
+
+&set_label("2ndmadd",16);
+ &mov ($carry,"edx");
+ &mul ($word); # np[j]*m
+ &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
+ &lea ($j,&DWP(1,$j));
+ &adc ("edx",0);
+ &add ($carry,"eax");
+ &mov ("eax",&DWP(0,$inp,$j,4)); # np[j+1]
+ &adc ("edx",0);
+ &cmp ($j,$num);
+ &mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j-1]=
+ &jl (&label("2ndmadd"));
+
+ &mov ($carry,"edx");
+ &mul ($word); # np[j]*m
+ &add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
+ &adc ("edx",0);
+ &add ($carry,"eax");
+ &adc ("edx",0);
+ &mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]=
+
+ &xor ("eax","eax");
+ &mov ($j,$_bp); # &bp[i]
+ &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
+ &adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1]
+ &lea ($j,&DWP(4,$j));
+ &mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
+ &cmp ($j,$_bpend);
+ &mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
+ &je (&label("common_tail"));
+
+ &mov ($word,&DWP(0,$j)); # bp[i+1]
+ &mov ($inp,$_ap);
+ &mov ($_bp,$j); # &bp[++i]
+ &xor ($j,$j);
+ &xor ("edx","edx");
+ &mov ("eax",&DWP(0,$inp));
+ &jmp (&label("1stmadd"));
+
+&set_label("bn_sqr_mont",16);
+$sbit=$num;
+ &mov ($_num,$num);
+ &mov ($_bp,$j); # i=0
+
+ &mov ("eax",$word); # ap[0]
+ &mul ($word); # ap[0]*ap[0]
+ &mov (&DWP($frame,"esp"),"eax"); # tp[0]=
+ &mov ($sbit,"edx");
+ &shr ("edx",1);
+ &and ($sbit,1);
+ &inc ($j);
+&set_label("sqr",16);
+ &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j]
+ &mov ($carry,"edx");
+ &mul ($word); # ap[j]*ap[0]
+ &add ("eax",$carry);
+ &lea ($j,&DWP(1,$j));
+ &adc ("edx",0);
+ &lea ($carry,&DWP(0,$sbit,"eax",2));
+ &shr ("eax",31);
+ &cmp ($j,$_num);
+ &mov ($sbit,"eax");
+ &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
+ &jl (&label("sqr"));
+
+ &mov ("eax",&DWP(0,$inp,$j,4)); # ap[num-1]
+ &mov ($carry,"edx");
+ &mul ($word); # ap[num-1]*ap[0]
+ &add ("eax",$carry);
+ &mov ($word,$_n0);
+ &adc ("edx",0);
+ &mov ($inp,$_np);
+ &lea ($carry,&DWP(0,$sbit,"eax",2));
+ &imul ($word,&DWP($frame,"esp")); # n0*tp[0]
+ &shr ("eax",31);
+ &mov (&DWP($frame,"esp",$j,4),$carry); # tp[num-1]=
+
+ &lea ($carry,&DWP(0,"eax","edx",2));
+ &mov ("eax",&DWP(0,$inp)); # np[0]
+ &shr ("edx",31);
+ &mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num]=
+ &mov (&DWP($frame+8,"esp",$j,4),"edx"); # tp[num+1]=
+
+ &mul ($word); # np[0]*m
+ &add ("eax",&DWP($frame,"esp")); # +=tp[0]
+ &mov ($num,$j);
+ &adc ("edx",0);
+ &mov ("eax",&DWP(4,$inp)); # np[1]
+ &mov ($j,1);
+
+&set_label("3rdmadd",16);
+ &mov ($carry,"edx");
+ &mul ($word); # np[j]*m
+ &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
+ &adc ("edx",0);
+ &add ($carry,"eax");
+ &mov ("eax",&DWP(4,$inp,$j,4)); # np[j+1]
+ &adc ("edx",0);
+ &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j-1]=
+
+ &mov ($carry,"edx");
+ &mul ($word); # np[j+1]*m
+ &add ($carry,&DWP($frame+4,"esp",$j,4)); # +=tp[j+1]
+ &lea ($j,&DWP(2,$j));
+ &adc ("edx",0);
+ &add ($carry,"eax");
+ &mov ("eax",&DWP(0,$inp,$j,4)); # np[j+2]
+ &adc ("edx",0);
+ &cmp ($j,$num);
+ &mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j]=
+ &jl (&label("3rdmadd"));
+
+ &mov ($carry,"edx");
+ &mul ($word); # np[j]*m
+ &add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
+ &adc ("edx",0);
+ &add ($carry,"eax");
+ &adc ("edx",0);
+ &mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]=
+
+ &mov ($j,$_bp); # i
+ &xor ("eax","eax");
+ &mov ($inp,$_ap);
+ &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
+ &adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1]
+ &mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
+ &cmp ($j,$num);
+ &mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
+ &je (&label("common_tail"));
+
+ &mov ($word,&DWP(4,$inp,$j,4)); # ap[i]
+ &lea ($j,&DWP(1,$j));
+ &mov ("eax",$word);
+ &mov ($_bp,$j); # ++i
+ &mul ($word); # ap[i]*ap[i]
+ &add ("eax",&DWP($frame,"esp",$j,4)); # +=tp[i]
+ &adc ("edx",0);
+ &mov (&DWP($frame,"esp",$j,4),"eax"); # tp[i]=
+ &xor ($carry,$carry);
+ &cmp ($j,$num);
+ &lea ($j,&DWP(1,$j));
+ &je (&label("sqrlast"));
+
+ &mov ($sbit,"edx"); # zaps $num
+ &shr ("edx",1);
+ &and ($sbit,1);
+&set_label("sqradd",16);
+ &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j]
+ &mov ($carry,"edx");
+ &mul ($word); # ap[j]*ap[i]
+ &add ("eax",$carry);
+ &lea ($carry,&DWP(0,"eax","eax"));
+ &adc ("edx",0);
+ &shr ("eax",31);
+ &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
+ &lea ($j,&DWP(1,$j));
+ &adc ("eax",0);
+ &add ($carry,$sbit);
+ &adc ("eax",0);
+ &cmp ($j,$_num);
+ &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
+ &mov ($sbit,"eax");
+ &jle (&label("sqradd"));
+
+ &mov ($carry,"edx");
+ &add ("edx","edx");
+ &shr ($carry,31);
+ &add ("edx",$sbit);
+ &adc ($carry,0);
+&set_label("sqrlast");
+ &mov ($word,$_n0);
+ &mov ($inp,$_np);
+ &imul ($word,&DWP($frame,"esp")); # n0*tp[0]
+
+ &add ("edx",&DWP($frame,"esp",$j,4)); # +=tp[num]
+ &mov ("eax",&DWP(0,$inp)); # np[0]
+ &adc ($carry,0);
+ &mov (&DWP($frame,"esp",$j,4),"edx"); # tp[num]=
+ &mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num+1]=
+
+ &mul ($word); # np[0]*m
+ &add ("eax",&DWP($frame,"esp")); # +=tp[0]
+ &lea ($num,&DWP(-1,$j));
+ &adc ("edx",0);
+ &mov ($j,1);
+ &mov ("eax",&DWP(4,$inp)); # np[1]
+
+ &jmp (&label("3rdmadd"));
+}
+
+&set_label("common_tail",16);
+ &mov ($np,$_np); # load modulus pointer
+ &mov ($rp,$_rp); # load result pointer
+ &lea ($tp,&DWP($frame,"esp")); # [$ap and $bp are zapped]
+
+ &mov ("eax",&DWP(0,$tp)); # tp[0]
+ &mov ($j,$num); # j=num-1
+ &xor ($i,$i); # i=0 and clear CF!
+
+&set_label("sub",16);
+ &sbb ("eax",&DWP(0,$np,$i,4));
+ &mov (&DWP(0,$rp,$i,4),"eax"); # rp[i]=tp[i]-np[i]
+ &dec ($j); # doesn't affect CF!
+ &mov ("eax",&DWP(4,$tp,$i,4)); # tp[i+1]
+ &lea ($i,&DWP(1,$i)); # i++
+ &jge (&label("sub"));
+
+ &sbb ("eax",0); # handle upmost overflow bit
+
+&set_label("copy",16); # copy or in-place refresh
+ &mov ("edx",&DWP(0,$tp,$num,4));
+ &mov ($np,&DWP(0,$rp,$num,4));
+ &xor ("edx",$np); # conditional select
+ &and ("edx","eax");
+ &xor ("edx",$np);
+ &mov (&DWP(0,$tp,$num,4),$j); # zap temporary vector
+ &mov (&DWP(0,$rp,$num,4),"edx"); # rp[i]=tp[i]
+ &dec ($num);
+ &jge (&label("copy"));
+
+ &mov ("esp",$_sp); # pull saved stack pointer
+ &mov ("eax",1);
+&set_label("just_leave");
+&function_end("bn_mul_mont");
+
+&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
+
+&asm_finish();
diff --git a/src/crypto/bn/asm/x86_64-gcc.c b/src/crypto/bn/asm/x86_64-gcc.c
new file mode 100644
index 0000000..b3d1965
--- /dev/null
+++ b/src/crypto/bn/asm/x86_64-gcc.c
@@ -0,0 +1,579 @@
+#include <openssl/bn.h>
+
+#if defined(OPENSSL_X86_64) && !defined(OPENSSL_WINDOWS)
+
+#include "../internal.h"
+
+/* x86_64 BIGNUM accelerator version 0.1, December 2002.
+ *
+ * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+ * project.
+ *
+ * Rights for redistribution and usage in source and binary forms are
+ * granted according to the OpenSSL license. Warranty of any kind is
+ * disclaimed.
+ *
+ * Q. Version 0.1? It doesn't sound like Andy; he used to assign real
+ * version numbers, like 1.0...
+ * A. Well, that's because this code is basically a quick-n-dirty
+ * proof-of-concept hack. As you can see it's implemented with
+ * inline assembler, which means that you're bound to GCC and that
+ * there might be enough room for further improvement.
+ *
+ * Q. Why inline assembler?
+ * A. x86_64 features its own ABI, which I'm not familiar with. This is
+ * why I decided to let the compiler take care of the subroutine
+ * prologue/epilogue as well as register allocation. For reference:
+ * Win64 implements a different ABI for AMD64 than Linux does.
+ *
+ * Q. How much faster does it get?
+ * A. 'apps/openssl speed rsa dsa' output with no-asm:
+ *
+ * sign verify sign/s verify/s
+ * rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2
+ * rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0
+ * rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8
+ * rsa 4096 bits 0.1155s 0.0018s 8.7 555.6
+ * sign verify sign/s verify/s
+ * dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3
+ * dsa 1024 bits 0.0014s 0.0018s 692.3 559.2
+ * dsa 2048 bits 0.0049s 0.0061s 204.7 165.0
+ *
+ * 'apps/openssl speed rsa dsa' output with this module:
+ *
+ * sign verify sign/s verify/s
+ * rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9
+ * rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7
+ * rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0
+ * rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8
+ * sign verify sign/s verify/s
+ * dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3
+ * dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4
+ * dsa 2048 bits 0.0016s 0.0020s 620.4 504.6
+ *
+ * For reference: the IA-32 assembler implementation performs very much
+ * like 64-bit code compiled with no-asm on the same machine.
+ */
+
+ /* TODO(davidben): Get this file working on Windows x64. */
+
+#undef mul
+#undef mul_add
+
+#define asm __asm__
+
+/*
+ * "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
+ * "g"(0) lets the compiler decide where it wants to keep
+ * the value of zero.
+ */
+#define mul_add(r, a, word, carry) \
+ do { \
+ register BN_ULONG high, low; \
+ asm("mulq %3" : "=a"(low), "=d"(high) : "a"(word), "m"(a) : "cc"); \
+ asm("addq %2,%0; adcq %3,%1" \
+ : "+r"(carry), "+d"(high) \
+ : "a"(low), "g"(0) \
+ : "cc"); \
+ asm("addq %2,%0; adcq %3,%1" \
+ : "+m"(r), "+d"(high) \
+ : "r"(carry), "g"(0) \
+ : "cc"); \
+ carry = high; \
+ } while (0)
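+
+/* For reference, a portable sketch of what mul_add computes, were a
+ * 128-bit intermediate used instead (illustration only, not part of
+ * this file):
+ *
+ *   unsigned __int128 t = (unsigned __int128)(a) * (word) + (r) + (carry);
+ *   (r) = (BN_ULONG)t;
+ *   (carry) = (BN_ULONG)(t >> 64);
+ */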
+
+#define mul(r, a, word, carry) \
+ do { \
+ register BN_ULONG high, low; \
+ asm("mulq %3" : "=a"(low), "=d"(high) : "a"(word), "g"(a) : "cc"); \
+ asm("addq %2,%0; adcq %3,%1" \
+ : "+r"(carry), "+d"(high) \
+ : "a"(low), "g"(0) \
+ : "cc"); \
+ (r) = carry, carry = high; \
+ } while (0)
+#undef sqr
+#define sqr(r0, r1, a) asm("mulq %2" : "=a"(r0), "=d"(r1) : "a"(a) : "cc");
+
+BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
+ BN_ULONG w) {
+ BN_ULONG c1 = 0;
+
+ if (num <= 0)
+ return (c1);
+
+ while (num & ~3) {
+ mul_add(rp[0], ap[0], w, c1);
+ mul_add(rp[1], ap[1], w, c1);
+ mul_add(rp[2], ap[2], w, c1);
+ mul_add(rp[3], ap[3], w, c1);
+ ap += 4;
+ rp += 4;
+ num -= 4;
+ }
+ if (num) {
+ mul_add(rp[0], ap[0], w, c1);
+ if (--num == 0)
+ return c1;
+ mul_add(rp[1], ap[1], w, c1);
+ if (--num == 0)
+ return c1;
+ mul_add(rp[2], ap[2], w, c1);
+ return c1;
+ }
+
+ return (c1);
+}
+
+BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) {
+ BN_ULONG c1 = 0;
+
+ if (num <= 0)
+ return (c1);
+
+ while (num & ~3) {
+ mul(rp[0], ap[0], w, c1);
+ mul(rp[1], ap[1], w, c1);
+ mul(rp[2], ap[2], w, c1);
+ mul(rp[3], ap[3], w, c1);
+ ap += 4;
+ rp += 4;
+ num -= 4;
+ }
+ if (num) {
+ mul(rp[0], ap[0], w, c1);
+ if (--num == 0)
+ return c1;
+ mul(rp[1], ap[1], w, c1);
+ if (--num == 0)
+ return c1;
+ mul(rp[2], ap[2], w, c1);
+ }
+ return (c1);
+}
+
+void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) {
+ if (n <= 0)
+ return;
+
+ while (n & ~3) {
+ sqr(r[0], r[1], a[0]);
+ sqr(r[2], r[3], a[1]);
+ sqr(r[4], r[5], a[2]);
+ sqr(r[6], r[7], a[3]);
+ a += 4;
+ r += 8;
+ n -= 4;
+ }
+ if (n) {
+ sqr(r[0], r[1], a[0]);
+ if (--n == 0)
+ return;
+ sqr(r[2], r[3], a[1]);
+ if (--n == 0)
+ return;
+ sqr(r[4], r[5], a[2]);
+ }
+}
+
+BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
+ BN_ULONG ret, waste;
+
+ asm("divq %4" : "=a"(ret), "=d"(waste) : "a"(l), "d"(h), "g"(d) : "cc");
+
+ return ret;
+}
+
+BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+ int n) {
+ BN_ULONG ret;
+ size_t i = 0;
+
+ if (n <= 0)
+ return 0;
+
+ asm volatile (
+ " subq %0,%0 \n" /* clear carry */
+ " jmp 1f \n"
+ ".p2align 4 \n"
+ "1: movq (%4,%2,8),%0 \n"
+ " adcq (%5,%2,8),%0 \n"
+ " movq %0,(%3,%2,8) \n"
+ " lea 1(%2),%2 \n"
+ " loop 1b \n"
+ " sbbq %0,%0 \n"
+ : "=&r"(ret), "+c"(n), "+r"(i)
+ : "r"(rp), "r"(ap), "r"(bp)
+ : "cc", "memory");
+
+ return ret & 1;
+}
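+
+/* A portable C sketch of bn_add_words' contract (illustration only);
+ * the return value is the final carry bit:
+ *
+ *   BN_ULONG c = 0;
+ *   for (int i = 0; i < n; i++) {
+ *     BN_ULONG t = ap[i] + c;
+ *     c = (t < c);
+ *     rp[i] = t + bp[i];
+ *     c += (rp[i] < t);
+ *   }
+ *   return c;
+ */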
+
+#ifndef SIMICS
+BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+ int n) {
+ BN_ULONG ret;
+ size_t i = 0;
+
+ if (n <= 0)
+ return 0;
+
+ asm volatile (
+ " subq %0,%0 \n" /* clear borrow */
+ " jmp 1f \n"
+ ".p2align 4 \n"
+ "1: movq (%4,%2,8),%0 \n"
+ " sbbq (%5,%2,8),%0 \n"
+ " movq %0,(%3,%2,8) \n"
+ " lea 1(%2),%2 \n"
+ " loop 1b \n"
+ " sbbq %0,%0 \n"
+ : "=&r"(ret), "+c"(n), "+r"(i)
+ : "r"(rp), "r"(ap), "r"(bp)
+ : "cc", "memory");
+
+ return ret & 1;
+}
+#else
+/* Simics 1.4<7 has buggy sbbq:-( */
+#define BN_MASK2 0xffffffffffffffffL
+BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) {
+ BN_ULONG t1, t2;
+ int c = 0;
+
+ if (n <= 0)
+ return ((BN_ULONG)0);
+
+ for (;;) {
+ t1 = a[0];
+ t2 = b[0];
+ r[0] = (t1 - t2 - c) & BN_MASK2;
+ if (t1 != t2)
+ c = (t1 < t2);
+ if (--n <= 0)
+ break;
+
+ t1 = a[1];
+ t2 = b[1];
+ r[1] = (t1 - t2 - c) & BN_MASK2;
+ if (t1 != t2)
+ c = (t1 < t2);
+ if (--n <= 0)
+ break;
+
+ t1 = a[2];
+ t2 = b[2];
+ r[2] = (t1 - t2 - c) & BN_MASK2;
+ if (t1 != t2)
+ c = (t1 < t2);
+ if (--n <= 0)
+ break;
+
+ t1 = a[3];
+ t2 = b[3];
+ r[3] = (t1 - t2 - c) & BN_MASK2;
+ if (t1 != t2)
+ c = (t1 < t2);
+ if (--n <= 0)
+ break;
+
+ a += 4;
+ b += 4;
+ r += 4;
+ }
+ return (c);
+}
+#endif
+
+/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
+/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
+/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
+/* sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number
+ * c=(c2,c1,c0) */
+
+/* Keep in mind that the carry into the high part of a multiplication result
+ * cannot overflow: the high word of a 64x64-bit product is at most 2^64 - 2
+ * (reached only when both inputs are 2^64 - 1), so it can never be all-ones
+ * and always has room for the carry. */
+#define mul_add_c(a, b, c0, c1, c2) \
+ do { \
+ BN_ULONG t1, t2; \
+ asm("mulq %3" : "=a"(t1), "=d"(t2) : "a"(a), "m"(b) : "cc"); \
+ asm("addq %3,%0; adcq %4,%1; adcq %5,%2" \
+ : "+r"(c0), "+r"(c1), "+r"(c2) \
+ : "r"(t1), "r"(t2), "g"(0) \
+ : "cc"); \
+ } while (0)
+
+#define sqr_add_c(a, i, c0, c1, c2) \
+ do { \
+ BN_ULONG t1, t2; \
+ asm("mulq %2" : "=a"(t1), "=d"(t2) : "a"(a[i]) : "cc"); \
+ asm("addq %3,%0; adcq %4,%1; adcq %5,%2" \
+ : "+r"(c0), "+r"(c1), "+r"(c2) \
+ : "r"(t1), "r"(t2), "g"(0) \
+ : "cc"); \
+ } while (0)
+
+#define mul_add_c2(a, b, c0, c1, c2) \
+ do { \
+ BN_ULONG t1, t2; \
+ asm("mulq %3" : "=a"(t1), "=d"(t2) : "a"(a), "m"(b) : "cc"); \
+ asm("addq %3,%0; adcq %4,%1; adcq %5,%2" \
+ : "+r"(c0), "+r"(c1), "+r"(c2) \
+ : "r"(t1), "r"(t2), "g"(0) \
+ : "cc"); \
+ asm("addq %3,%0; adcq %4,%1; adcq %5,%2" \
+ : "+r"(c0), "+r"(c1), "+r"(c2) \
+ : "r"(t1), "r"(t2), "g"(0) \
+ : "cc"); \
+ } while (0)
+
+#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
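+
+/* The unrolled bn_mul_comba4/8 routines below evaluate the product column by
+ * column (Comba multiplication): all partial products a[i]*b[k-i] of a given
+ * output column k are summed into the three-word accumulator (c2,c1,c0)
+ * before one result word is written. A hedged generic sketch of the same
+ * schedule (hypothetical helper, not compiled in; assumes unsigned __int128;
+ * the real code is fully unrolled): */
+#if 0
+static void bn_mul_comba_ref(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
+                             int n) {
+  BN_ULONG c0 = 0, c1 = 0, c2 = 0;
+  for (int k = 0; k < 2 * n - 1; k++) {
+    for (int i = k < n ? 0 : k - n + 1; i <= k && i < n; i++) {
+      unsigned __int128 t = (unsigned __int128)a[i] * b[k - i];
+      BN_ULONG lo = (BN_ULONG)t, hi = (BN_ULONG)(t >> 64);
+      c0 += lo;
+      hi += (c0 < lo); /* carry into hi cannot overflow, see above */
+      c1 += hi;
+      c2 += (c1 < hi);
+    }
+    r[k] = c0; /* emit one column, shift the accumulator down a word */
+    c0 = c1;
+    c1 = c2;
+    c2 = 0;
+  }
+  r[2 * n - 1] = c0;
+}
+#endif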
+
+void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
+ BN_ULONG c1, c2, c3;
+
+ c1 = 0;
+ c2 = 0;
+ c3 = 0;
+ mul_add_c(a[0], b[0], c1, c2, c3);
+ r[0] = c1;
+ c1 = 0;
+ mul_add_c(a[0], b[1], c2, c3, c1);
+ mul_add_c(a[1], b[0], c2, c3, c1);
+ r[1] = c2;
+ c2 = 0;
+ mul_add_c(a[2], b[0], c3, c1, c2);
+ mul_add_c(a[1], b[1], c3, c1, c2);
+ mul_add_c(a[0], b[2], c3, c1, c2);
+ r[2] = c3;
+ c3 = 0;
+ mul_add_c(a[0], b[3], c1, c2, c3);
+ mul_add_c(a[1], b[2], c1, c2, c3);
+ mul_add_c(a[2], b[1], c1, c2, c3);
+ mul_add_c(a[3], b[0], c1, c2, c3);
+ r[3] = c1;
+ c1 = 0;
+ mul_add_c(a[4], b[0], c2, c3, c1);
+ mul_add_c(a[3], b[1], c2, c3, c1);
+ mul_add_c(a[2], b[2], c2, c3, c1);
+ mul_add_c(a[1], b[3], c2, c3, c1);
+ mul_add_c(a[0], b[4], c2, c3, c1);
+ r[4] = c2;
+ c2 = 0;
+ mul_add_c(a[0], b[5], c3, c1, c2);
+ mul_add_c(a[1], b[4], c3, c1, c2);
+ mul_add_c(a[2], b[3], c3, c1, c2);
+ mul_add_c(a[3], b[2], c3, c1, c2);
+ mul_add_c(a[4], b[1], c3, c1, c2);
+ mul_add_c(a[5], b[0], c3, c1, c2);
+ r[5] = c3;
+ c3 = 0;
+ mul_add_c(a[6], b[0], c1, c2, c3);
+ mul_add_c(a[5], b[1], c1, c2, c3);
+ mul_add_c(a[4], b[2], c1, c2, c3);
+ mul_add_c(a[3], b[3], c1, c2, c3);
+ mul_add_c(a[2], b[4], c1, c2, c3);
+ mul_add_c(a[1], b[5], c1, c2, c3);
+ mul_add_c(a[0], b[6], c1, c2, c3);
+ r[6] = c1;
+ c1 = 0;
+ mul_add_c(a[0], b[7], c2, c3, c1);
+ mul_add_c(a[1], b[6], c2, c3, c1);
+ mul_add_c(a[2], b[5], c2, c3, c1);
+ mul_add_c(a[3], b[4], c2, c3, c1);
+ mul_add_c(a[4], b[3], c2, c3, c1);
+ mul_add_c(a[5], b[2], c2, c3, c1);
+ mul_add_c(a[6], b[1], c2, c3, c1);
+ mul_add_c(a[7], b[0], c2, c3, c1);
+ r[7] = c2;
+ c2 = 0;
+ mul_add_c(a[7], b[1], c3, c1, c2);
+ mul_add_c(a[6], b[2], c3, c1, c2);
+ mul_add_c(a[5], b[3], c3, c1, c2);
+ mul_add_c(a[4], b[4], c3, c1, c2);
+ mul_add_c(a[3], b[5], c3, c1, c2);
+ mul_add_c(a[2], b[6], c3, c1, c2);
+ mul_add_c(a[1], b[7], c3, c1, c2);
+ r[8] = c3;
+ c3 = 0;
+ mul_add_c(a[2], b[7], c1, c2, c3);
+ mul_add_c(a[3], b[6], c1, c2, c3);
+ mul_add_c(a[4], b[5], c1, c2, c3);
+ mul_add_c(a[5], b[4], c1, c2, c3);
+ mul_add_c(a[6], b[3], c1, c2, c3);
+ mul_add_c(a[7], b[2], c1, c2, c3);
+ r[9] = c1;
+ c1 = 0;
+ mul_add_c(a[7], b[3], c2, c3, c1);
+ mul_add_c(a[6], b[4], c2, c3, c1);
+ mul_add_c(a[5], b[5], c2, c3, c1);
+ mul_add_c(a[4], b[6], c2, c3, c1);
+ mul_add_c(a[3], b[7], c2, c3, c1);
+ r[10] = c2;
+ c2 = 0;
+ mul_add_c(a[4], b[7], c3, c1, c2);
+ mul_add_c(a[5], b[6], c3, c1, c2);
+ mul_add_c(a[6], b[5], c3, c1, c2);
+ mul_add_c(a[7], b[4], c3, c1, c2);
+ r[11] = c3;
+ c3 = 0;
+ mul_add_c(a[7], b[5], c1, c2, c3);
+ mul_add_c(a[6], b[6], c1, c2, c3);
+ mul_add_c(a[5], b[7], c1, c2, c3);
+ r[12] = c1;
+ c1 = 0;
+ mul_add_c(a[6], b[7], c2, c3, c1);
+ mul_add_c(a[7], b[6], c2, c3, c1);
+ r[13] = c2;
+ c2 = 0;
+ mul_add_c(a[7], b[7], c3, c1, c2);
+ r[14] = c3;
+ r[15] = c1;
+}
+
+void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
+ BN_ULONG c1, c2, c3;
+
+ c1 = 0;
+ c2 = 0;
+ c3 = 0;
+ mul_add_c(a[0], b[0], c1, c2, c3);
+ r[0] = c1;
+ c1 = 0;
+ mul_add_c(a[0], b[1], c2, c3, c1);
+ mul_add_c(a[1], b[0], c2, c3, c1);
+ r[1] = c2;
+ c2 = 0;
+ mul_add_c(a[2], b[0], c3, c1, c2);
+ mul_add_c(a[1], b[1], c3, c1, c2);
+ mul_add_c(a[0], b[2], c3, c1, c2);
+ r[2] = c3;
+ c3 = 0;
+ mul_add_c(a[0], b[3], c1, c2, c3);
+ mul_add_c(a[1], b[2], c1, c2, c3);
+ mul_add_c(a[2], b[1], c1, c2, c3);
+ mul_add_c(a[3], b[0], c1, c2, c3);
+ r[3] = c1;
+ c1 = 0;
+ mul_add_c(a[3], b[1], c2, c3, c1);
+ mul_add_c(a[2], b[2], c2, c3, c1);
+ mul_add_c(a[1], b[3], c2, c3, c1);
+ r[4] = c2;
+ c2 = 0;
+ mul_add_c(a[2], b[3], c3, c1, c2);
+ mul_add_c(a[3], b[2], c3, c1, c2);
+ r[5] = c3;
+ c3 = 0;
+ mul_add_c(a[3], b[3], c1, c2, c3);
+ r[6] = c1;
+ r[7] = c2;
+}
+
+void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) {
+ BN_ULONG c1, c2, c3;
+
+ c1 = 0;
+ c2 = 0;
+ c3 = 0;
+ sqr_add_c(a, 0, c1, c2, c3);
+ r[0] = c1;
+ c1 = 0;
+ sqr_add_c2(a, 1, 0, c2, c3, c1);
+ r[1] = c2;
+ c2 = 0;
+ sqr_add_c(a, 1, c3, c1, c2);
+ sqr_add_c2(a, 2, 0, c3, c1, c2);
+ r[2] = c3;
+ c3 = 0;
+ sqr_add_c2(a, 3, 0, c1, c2, c3);
+ sqr_add_c2(a, 2, 1, c1, c2, c3);
+ r[3] = c1;
+ c1 = 0;
+ sqr_add_c(a, 2, c2, c3, c1);
+ sqr_add_c2(a, 3, 1, c2, c3, c1);
+ sqr_add_c2(a, 4, 0, c2, c3, c1);
+ r[4] = c2;
+ c2 = 0;
+ sqr_add_c2(a, 5, 0, c3, c1, c2);
+ sqr_add_c2(a, 4, 1, c3, c1, c2);
+ sqr_add_c2(a, 3, 2, c3, c1, c2);
+ r[5] = c3;
+ c3 = 0;
+ sqr_add_c(a, 3, c1, c2, c3);
+ sqr_add_c2(a, 4, 2, c1, c2, c3);
+ sqr_add_c2(a, 5, 1, c1, c2, c3);
+ sqr_add_c2(a, 6, 0, c1, c2, c3);
+ r[6] = c1;
+ c1 = 0;
+ sqr_add_c2(a, 7, 0, c2, c3, c1);
+ sqr_add_c2(a, 6, 1, c2, c3, c1);
+ sqr_add_c2(a, 5, 2, c2, c3, c1);
+ sqr_add_c2(a, 4, 3, c2, c3, c1);
+ r[7] = c2;
+ c2 = 0;
+ sqr_add_c(a, 4, c3, c1, c2);
+ sqr_add_c2(a, 5, 3, c3, c1, c2);
+ sqr_add_c2(a, 6, 2, c3, c1, c2);
+ sqr_add_c2(a, 7, 1, c3, c1, c2);
+ r[8] = c3;
+ c3 = 0;
+ sqr_add_c2(a, 7, 2, c1, c2, c3);
+ sqr_add_c2(a, 6, 3, c1, c2, c3);
+ sqr_add_c2(a, 5, 4, c1, c2, c3);
+ r[9] = c1;
+ c1 = 0;
+ sqr_add_c(a, 5, c2, c3, c1);
+ sqr_add_c2(a, 6, 4, c2, c3, c1);
+ sqr_add_c2(a, 7, 3, c2, c3, c1);
+ r[10] = c2;
+ c2 = 0;
+ sqr_add_c2(a, 7, 4, c3, c1, c2);
+ sqr_add_c2(a, 6, 5, c3, c1, c2);
+ r[11] = c3;
+ c3 = 0;
+ sqr_add_c(a, 6, c1, c2, c3);
+ sqr_add_c2(a, 7, 5, c1, c2, c3);
+ r[12] = c1;
+ c1 = 0;
+ sqr_add_c2(a, 7, 6, c2, c3, c1);
+ r[13] = c2;
+ c2 = 0;
+ sqr_add_c(a, 7, c3, c1, c2);
+ r[14] = c3;
+ r[15] = c1;
+}
+
+void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) {
+ BN_ULONG c1, c2, c3;
+
+ c1 = 0;
+ c2 = 0;
+ c3 = 0;
+ sqr_add_c(a, 0, c1, c2, c3);
+ r[0] = c1;
+ c1 = 0;
+ sqr_add_c2(a, 1, 0, c2, c3, c1);
+ r[1] = c2;
+ c2 = 0;
+ sqr_add_c(a, 1, c3, c1, c2);
+ sqr_add_c2(a, 2, 0, c3, c1, c2);
+ r[2] = c3;
+ c3 = 0;
+ sqr_add_c2(a, 3, 0, c1, c2, c3);
+ sqr_add_c2(a, 2, 1, c1, c2, c3);
+ r[3] = c1;
+ c1 = 0;
+ sqr_add_c(a, 2, c2, c3, c1);
+ sqr_add_c2(a, 3, 1, c2, c3, c1);
+ r[4] = c2;
+ c2 = 0;
+ sqr_add_c2(a, 3, 2, c3, c1, c2);
+ r[5] = c3;
+ c3 = 0;
+ sqr_add_c(a, 3, c1, c2, c3);
+ r[6] = c1;
+ r[7] = c2;
+}
+
+#endif /* defined(OPENSSL_X86_64) && !defined(OPENSSL_WINDOWS) */
diff --git a/src/crypto/bn/asm/x86_64-mont.pl b/src/crypto/bn/asm/x86_64-mont.pl
new file mode 100644
index 0000000..39476ab
--- /dev/null
+++ b/src/crypto/bn/asm/x86_64-mont.pl
@@ -0,0 +1,1401 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# October 2005.
+#
+# Montgomery multiplication routine for x86_64. While it gives a modest
+# 9% improvement for rsa4096 sign on Opteron, rsa512 sign runs more than
+# twice as fast. The most common case, rsa1024 sign, is improved by a
+# respectable 50%. It remains to be seen whether loop unrolling and a
+# dedicated squaring routine can provide further improvement...
+
+# July 2011.
+#
+# Add dedicated squaring procedure. Performance improvement varies
+# from platform to platform, but on average it's ~5%/15%/25%/33%
+# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
+
+# August 2011.
+#
+# Unroll and modulo-schedule the inner loops in such a manner that they
+# are "fallen through" for input lengths of 8, which is critical for
+# 1024-bit RSA *sign*. The average performance improvement in comparison
+# to the *initial* version of this module from 2005 is ~0%/30%/40%/45%
+# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
+
+# June 2013.
+#
+# Optimize reduction in squaring procedure and improve 1024+-bit RSA
+# sign performance by 10-16% on Intel Sandy Bridge and later
+# (virtually same on non-Intel processors).
+
+# August 2013.
+#
+# Add MULX/ADOX/ADCX code path.
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+ $addx = ($1>=2.23);
+}
+
+if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+ $addx = ($1>=2.10);
+}
+
+if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+ $addx = ($1>=12);
+}
+
+# int bn_mul_mont(
+$rp="%rdi"; # BN_ULONG *rp,
+$ap="%rsi"; # const BN_ULONG *ap,
+$bp="%rdx"; # const BN_ULONG *bp,
+$np="%rcx"; # const BN_ULONG *np,
+$n0="%r8"; # const BN_ULONG *n0,
+$num="%r9"; # int num);
+$lo0="%r10";
+$hi0="%r11";
+$hi1="%r13";
+$i="%r14";
+$j="%r15";
+$m0="%rbx";
+$m1="%rbp";
+
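+# The code below implements word-serial Montgomery multiplication;
+# roughly, as a hedged pseudocode sketch (tp[] is the stack temporary,
+# n0 = -np[0]^-1 mod 2^64):
+#
+#	for (i = 0; i < num; i++) {
+#		m0 = bp[i];
+#		m1 = (tp[0] + ap[0]*m0) * n0 mod 2^64;	# low word cancels
+#		for (j = 0; j < num; j++)
+#			accumulate ap[j]*m0 + np[j]*m1 + tp[j] into tp[j-1]
+#			(the shifted store is the division by 2^64);
+#		propagate carries into tp[num-1], tp[num];
+#	}
+#	subtract np once if tp >= np (constant-time select at .Lcopy)
+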
+$code=<<___;
+.text
+
+.extern OPENSSL_ia32cap_P
+
+.globl bn_mul_mont
+.type bn_mul_mont,\@function,6
+.align 16
+bn_mul_mont:
+ test \$3,${num}d
+ jnz .Lmul_enter
+ cmp \$8,${num}d
+ jb .Lmul_enter
+___
+$code.=<<___ if ($addx);
+ mov OPENSSL_ia32cap_P+8(%rip),%r11d
+___
+$code.=<<___;
+ cmp $ap,$bp
+ jne .Lmul4x_enter
+ test \$7,${num}d
+ jz .Lsqr8x_enter
+ jmp .Lmul4x_enter
+
+.align 16
+.Lmul_enter:
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov ${num}d,${num}d
+ lea 2($num),%r10
+ mov %rsp,%r11
+ neg %r10
+ lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+2))
+ and \$-1024,%rsp # minimize TLB usage
+
+ mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp
+.Lmul_body:
+ mov $bp,%r12 # reassign $bp
+___
+ $bp="%r12";
+$code.=<<___;
+ mov ($n0),$n0 # pull n0[0] value
+ mov ($bp),$m0 # m0=bp[0]
+ mov ($ap),%rax
+
+ xor $i,$i # i=0
+ xor $j,$j # j=0
+
+ mov $n0,$m1
+ mulq $m0 # ap[0]*bp[0]
+ mov %rax,$lo0
+ mov ($np),%rax
+
+ imulq $lo0,$m1 # "tp[0]"*n0
+ mov %rdx,$hi0
+
+ mulq $m1 # np[0]*m1
+ add %rax,$lo0 # discarded
+ mov 8($ap),%rax
+ adc \$0,%rdx
+ mov %rdx,$hi1
+
+ lea 1($j),$j # j++
+ jmp .L1st_enter
+
+.align 16
+.L1st:
+ add %rax,$hi1
+ mov ($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
+ mov $lo0,$hi0
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$hi1
+
+.L1st_enter:
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$hi0
+ mov ($np,$j,8),%rax
+ adc \$0,%rdx
+ lea 1($j),$j # j++
+ mov %rdx,$lo0
+
+ mulq $m1 # np[j]*m1
+ cmp $num,$j
+ jne .L1st
+
+ add %rax,$hi1
+ mov ($ap),%rax # ap[0]
+ adc \$0,%rdx
+ add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$hi1
+ mov $lo0,$hi0
+
+ xor %rdx,%rdx
+ add $hi0,$hi1
+ adc \$0,%rdx
+ mov $hi1,-8(%rsp,$num,8)
+ mov %rdx,(%rsp,$num,8) # store upmost overflow bit
+
+ lea 1($i),$i # i++
+ jmp .Louter
+.align 16
+.Louter:
+ mov ($bp,$i,8),$m0 # m0=bp[i]
+ xor $j,$j # j=0
+ mov $n0,$m1
+ mov (%rsp),$lo0
+ mulq $m0 # ap[0]*bp[i]
+ add %rax,$lo0 # ap[0]*bp[i]+tp[0]
+ mov ($np),%rax
+ adc \$0,%rdx
+
+ imulq $lo0,$m1 # tp[0]*n0
+ mov %rdx,$hi0
+
+ mulq $m1 # np[0]*m1
+ add %rax,$lo0 # discarded
+ mov 8($ap),%rax
+ adc \$0,%rdx
+ mov 8(%rsp),$lo0 # tp[1]
+ mov %rdx,$hi1
+
+ lea 1($j),$j # j++
+ jmp .Linner_enter
+
+.align 16
+.Linner:
+ add %rax,$hi1
+ mov ($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
+ mov (%rsp,$j,8),$lo0
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$hi1
+
+.Linner_enter:
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$hi0
+ mov ($np,$j,8),%rax
+ adc \$0,%rdx
+ add $hi0,$lo0 # ap[j]*bp[i]+tp[j]
+ mov %rdx,$hi0
+ adc \$0,$hi0
+ lea 1($j),$j # j++
+
+ mulq $m1 # np[j]*m1
+ cmp $num,$j
+ jne .Linner
+
+ add %rax,$hi1
+ mov ($ap),%rax # ap[0]
+ adc \$0,%rdx
+ add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
+ mov (%rsp,$j,8),$lo0
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$hi1
+
+ xor %rdx,%rdx
+ add $hi0,$hi1
+ adc \$0,%rdx
+ add $lo0,$hi1 # pull upmost overflow bit
+ adc \$0,%rdx
+ mov $hi1,-8(%rsp,$num,8)
+ mov %rdx,(%rsp,$num,8) # store upmost overflow bit
+
+ lea 1($i),$i # i++
+ cmp $num,$i
+ jb .Louter
+
+ xor $i,$i # i=0 and clear CF!
+ mov (%rsp),%rax # tp[0]
+ lea (%rsp),$ap # borrow ap for tp
+ mov $num,$j # j=num
+ jmp .Lsub
+.align 16
+.Lsub: sbb ($np,$i,8),%rax
+ mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
+ mov 8($ap,$i,8),%rax # tp[i+1]
+ lea 1($i),$i # i++
+ dec $j # doesn't affect CF!
+ jnz .Lsub
+
+ sbb \$0,%rax # handle upmost overflow bit
+ xor $i,$i
+ mov $num,$j # j=num
+.align 16
+.Lcopy: # copy or in-place refresh
+ mov (%rsp,$i,8),$ap
+ mov ($rp,$i,8),$np
+ xor $np,$ap # conditional select:
+ and %rax,$ap # ((ap ^ np) & %rax) ^ np
+ xor $np,$ap # ap = borrow?tp:rp
+ mov $i,(%rsp,$i,8) # zap temporary vector
+ mov $ap,($rp,$i,8) # rp[i]=tp[i]
+ lea 1($i),$i
+ sub \$1,$j
+ jnz .Lcopy
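+
+	# the borrow-controlled select above picks either the reduced
+	# value already stored at rp[] or the original tp[] without
+	# branching on secret data, and wipes the stack temporary as
+	# it goes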
+
+ mov 8(%rsp,$num,8),%rsi # restore %rsp
+ mov \$1,%rax
+ mov (%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lmul_epilogue:
+ ret
+.size bn_mul_mont,.-bn_mul_mont
+___
+{{{
+my @A=("%r10","%r11");
+my @N=("%r13","%rdi");
+$code.=<<___;
+.type bn_mul4x_mont,\@function,6
+.align 16
+bn_mul4x_mont:
+.Lmul4x_enter:
+___
+$code.=<<___ if ($addx);
+ and \$0x80100,%r11d
+ cmp \$0x80100,%r11d
+ je .Lmulx4x_enter
+___
+$code.=<<___;
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov ${num}d,${num}d
+ lea 4($num),%r10
+ mov %rsp,%r11
+ neg %r10
+ lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+4))
+ and \$-1024,%rsp # minimize TLB usage
+
+ mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp
+.Lmul4x_body:
+ mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp
+ mov %rdx,%r12 # reassign $bp
+___
+ $bp="%r12";
+$code.=<<___;
+ mov ($n0),$n0 # pull n0[0] value
+ mov ($bp),$m0 # m0=bp[0]
+ mov ($ap),%rax
+
+ xor $i,$i # i=0
+ xor $j,$j # j=0
+
+ mov $n0,$m1
+ mulq $m0 # ap[0]*bp[0]
+ mov %rax,$A[0]
+ mov ($np),%rax
+
+ imulq $A[0],$m1 # "tp[0]"*n0
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[0]*m1
+ add %rax,$A[0] # discarded
+ mov 8($ap),%rax
+ adc \$0,%rdx
+ mov %rdx,$N[1]
+
+ mulq $m0
+ add %rax,$A[1]
+ mov 8($np),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1
+ add %rax,$N[1]
+ mov 16($ap),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ lea 4($j),$j # j++
+ adc \$0,%rdx
+ mov $N[1],(%rsp)
+ mov %rdx,$N[0]
+ jmp .L1st4x
+.align 16
+.L1st4x:
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[0]
+ mov -16($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[1]
+ mov -8($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[0]
+ mov ($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov 8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[0],-8(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[1]
+ mov 8($np,$j,8),%rax
+ adc \$0,%rdx
+ lea 4($j),$j # j++
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov -16($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[1],-32(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+ cmp $num,$j
+ jb .L1st4x
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[0]
+ mov -16($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[1]
+ mov -8($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap),%rax # ap[0]
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+
+ xor $N[1],$N[1]
+ add $A[0],$N[0]
+ adc \$0,$N[1]
+ mov $N[0],-8(%rsp,$j,8)
+ mov $N[1],(%rsp,$j,8) # store upmost overflow bit
+
+ lea 1($i),$i # i++
+.align 4
+.Louter4x:
+ mov ($bp,$i,8),$m0 # m0=bp[i]
+ xor $j,$j # j=0
+ mov (%rsp),$A[0]
+ mov $n0,$m1
+ mulq $m0 # ap[0]*bp[i]
+ add %rax,$A[0] # ap[0]*bp[i]+tp[0]
+ mov ($np),%rax
+ adc \$0,%rdx
+
+ imulq $A[0],$m1 # tp[0]*n0
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[0]*m1
+ add %rax,$A[0] # "$N[0]", discarded
+ mov 8($ap),%rax
+ adc \$0,%rdx
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov 8($np),%rax
+ adc \$0,%rdx
+ add 8(%rsp),$A[1] # +tp[1]
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov 16($ap),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j]
+ lea 4($j),$j # j+=2
+ adc \$0,%rdx
+ mov $N[1],(%rsp) # tp[j-1]
+ mov %rdx,$N[0]
+ jmp .Linner4x
+.align 16
+.Linner4x:
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[0]
+ mov -16($np,$j,8),%rax
+ adc \$0,%rdx
+ add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0]
+ adc \$0,%rdx
+ mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov -8($np,$j,8),%rax
+ adc \$0,%rdx
+ add -8(%rsp,$j,8),$A[1]
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ adc \$0,%rdx
+ mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[0]
+ mov ($np,$j,8),%rax
+ adc \$0,%rdx
+ add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov 8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0]
+ adc \$0,%rdx
+ mov $N[0],-8(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov 8($np,$j,8),%rax
+ adc \$0,%rdx
+ add 8(%rsp,$j,8),$A[1]
+ adc \$0,%rdx
+ lea 4($j),$j # j++
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov -16($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ adc \$0,%rdx
+ mov $N[1],-32(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+ cmp $num,$j
+ jb .Linner4x
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[0]
+ mov -16($np,$j,8),%rax
+ adc \$0,%rdx
+ add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0]
+ adc \$0,%rdx
+ mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov -8($np,$j,8),%rax
+ adc \$0,%rdx
+ add -8(%rsp,$j,8),$A[1]
+ adc \$0,%rdx
+ lea 1($i),$i # i++
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap),%rax # ap[0]
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ adc \$0,%rdx
+ mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+
+ xor $N[1],$N[1]
+ add $A[0],$N[0]
+ adc \$0,$N[1]
+ add (%rsp,$num,8),$N[0] # pull upmost overflow bit
+ adc \$0,$N[1]
+ mov $N[0],-8(%rsp,$j,8)
+ mov $N[1],(%rsp,$j,8) # store upmost overflow bit
+
+ cmp $num,$i
+ jb .Louter4x
+___
+{
+my @ri=("%rax","%rdx",$m0,$m1);
+$code.=<<___;
+ mov 16(%rsp,$num,8),$rp # restore $rp
+ mov 0(%rsp),@ri[0] # tp[0]
+ mov 8(%rsp),@ri[1] # tp[1]
+ shr \$2,$num # num/=4
+ lea (%rsp),$ap # borrow ap for tp
+ xor $i,$i # i=0 and clear CF!
+
+ sub 0($np),@ri[0]
+ mov 16($ap),@ri[2] # tp[2]
+ mov 24($ap),@ri[3] # tp[3]
+ sbb 8($np),@ri[1]
+ lea -1($num),$j # j=num/4-1
+ jmp .Lsub4x
+.align 16
+.Lsub4x:
+ mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
+ mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 16($np,$i,8),@ri[2]
+ mov 32($ap,$i,8),@ri[0] # tp[i+1]
+ mov 40($ap,$i,8),@ri[1]
+ sbb 24($np,$i,8),@ri[3]
+ mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
+ mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 32($np,$i,8),@ri[0]
+ mov 48($ap,$i,8),@ri[2]
+ mov 56($ap,$i,8),@ri[3]
+ sbb 40($np,$i,8),@ri[1]
+ lea 4($i),$i # i++
+	dec	$j		# doesn't affect CF!
+ jnz .Lsub4x
+
+ mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
+ mov 32($ap,$i,8),@ri[0] # load overflow bit
+ sbb 16($np,$i,8),@ri[2]
+ mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 24($np,$i,8),@ri[3]
+ mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
+
+ sbb \$0,@ri[0] # handle upmost overflow bit
+ mov @ri[0],%xmm0
+ punpcklqdq %xmm0,%xmm0 # extend mask to 128 bits
+ mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
+ xor $i,$i # i=0
+
+ mov $num,$j
+ pxor %xmm5,%xmm5
+ jmp .Lcopy4x
+.align 16
+.Lcopy4x: # copy or in-place refresh
+ movdqu (%rsp,$i),%xmm2
+ movdqu 16(%rsp,$i),%xmm4
+ movdqu ($rp,$i),%xmm1
+ movdqu 16($rp,$i),%xmm3
+ pxor %xmm1,%xmm2 # conditional select
+ pxor %xmm3,%xmm4
+ pand %xmm0,%xmm2
+ pand %xmm0,%xmm4
+ pxor %xmm1,%xmm2
+ pxor %xmm3,%xmm4
+ movdqu %xmm2,($rp,$i)
+ movdqu %xmm4,16($rp,$i)
+ movdqa %xmm5,(%rsp,$i) # zap temporary vectors
+ movdqa %xmm5,16(%rsp,$i)
+
+ lea 32($i),$i
+ dec $j
+ jnz .Lcopy4x
+
+ shl \$2,$num
+___
+}
+$code.=<<___;
+ mov 8(%rsp,$num,8),%rsi # restore %rsp
+ mov \$1,%rax
+ mov (%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lmul4x_epilogue:
+ ret
+.size bn_mul4x_mont,.-bn_mul4x_mont
+___
+}}}
+ {{{
+######################################################################
+# void bn_sqr8x_mont(
+my $rptr="%rdi"; # const BN_ULONG *rptr,
+my $aptr="%rsi"; # const BN_ULONG *aptr,
+my $bptr="%rdx"; # not used
+my $nptr="%rcx"; # const BN_ULONG *nptr,
+my $n0 ="%r8"; # const BN_ULONG *n0);
+my $num ="%r9"; # int num, has to be divisible by 8
+
+my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
+my @A0=("%r10","%r11");
+my @A1=("%r12","%r13");
+my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
+
+$code.=<<___ if ($addx);
+.extern bn_sqrx8x_internal # see x86_64-mont5 module
+___
+$code.=<<___;
+.extern bn_sqr8x_internal # see x86_64-mont5 module
+
+.type bn_sqr8x_mont,\@function,6
+.align 32
+bn_sqr8x_mont:
+.Lsqr8x_enter:
+ mov %rsp,%rax
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov ${num}d,%r10d
+ shl \$3,${num}d # convert $num to bytes
+ shl \$3+2,%r10 # 4*$num
+ neg $num
+
+ ##############################################################
+	# ensure that the stack frame doesn't alias with $aptr modulo
+	# 4096. this is done to allow the memory disambiguation logic
+	# to do its job.
+ #
+ lea -64(%rsp,$num,4),%r11
+ mov ($n0),$n0 # *n0
+ sub $aptr,%r11
+ and \$4095,%r11
+ cmp %r11,%r10
+ jb .Lsqr8x_sp_alt
+ sub %r11,%rsp # align with $aptr
+ lea -64(%rsp,$num,4),%rsp # alloca(frame+4*$num)
+ jmp .Lsqr8x_sp_done
+
+.align 32
+.Lsqr8x_sp_alt:
+ lea 4096-64(,$num,4),%r10 # 4096-frame-4*$num
+ lea -64(%rsp,$num,4),%rsp # alloca(frame+4*$num)
+ sub %r10,%r11
+ mov \$0,%r10
+ cmovc %r10,%r11
+ sub %r11,%rsp
+.Lsqr8x_sp_done:
+ and \$-64,%rsp
+ mov $num,%r10
+ neg $num
+
+ lea 64(%rsp,$num,2),%r11 # copy of modulus
+ mov $n0, 32(%rsp)
+ mov %rax, 40(%rsp) # save original %rsp
+.Lsqr8x_body:
+
+ mov $num,$i
+ movq %r11, %xmm2 # save pointer to modulus copy
+ shr \$3+2,$i
+ mov OPENSSL_ia32cap_P+8(%rip),%eax
+ jmp .Lsqr8x_copy_n
+
+.align 32
+.Lsqr8x_copy_n:
+ movq 8*0($nptr),%xmm0
+ movq 8*1($nptr),%xmm1
+ movq 8*2($nptr),%xmm3
+ movq 8*3($nptr),%xmm4
+ lea 8*4($nptr),$nptr
+ movdqa %xmm0,16*0(%r11)
+ movdqa %xmm1,16*1(%r11)
+ movdqa %xmm3,16*2(%r11)
+ movdqa %xmm4,16*3(%r11)
+ lea 16*4(%r11),%r11
+ dec $i
+ jnz .Lsqr8x_copy_n
+
+ pxor %xmm0,%xmm0
+ movq $rptr,%xmm1 # save $rptr
+ movq %r10, %xmm3 # -$num
+___
+$code.=<<___ if ($addx);
+ and \$0x80100,%eax
+ cmp \$0x80100,%eax
+ jne .Lsqr8x_nox
+
+ call bn_sqrx8x_internal # see x86_64-mont5 module
+
+ pxor %xmm0,%xmm0
+ lea 48(%rsp),%rax
+ lea 64(%rsp,$num,2),%rdx
+ shr \$3+2,$num
+ mov 40(%rsp),%rsi # restore %rsp
+ jmp .Lsqr8x_zero
+
+.align 32
+.Lsqr8x_nox:
+___
+$code.=<<___;
+ call bn_sqr8x_internal # see x86_64-mont5 module
+
+ pxor %xmm0,%xmm0
+ lea 48(%rsp),%rax
+ lea 64(%rsp,$num,2),%rdx
+ shr \$3+2,$num
+ mov 40(%rsp),%rsi # restore %rsp
+ jmp .Lsqr8x_zero
+
+.align 32
+.Lsqr8x_zero:
+ movdqa %xmm0,16*0(%rax) # wipe t
+ movdqa %xmm0,16*1(%rax)
+ movdqa %xmm0,16*2(%rax)
+ movdqa %xmm0,16*3(%rax)
+ lea 16*4(%rax),%rax
+ movdqa %xmm0,16*0(%rdx) # wipe n
+ movdqa %xmm0,16*1(%rdx)
+ movdqa %xmm0,16*2(%rdx)
+ movdqa %xmm0,16*3(%rdx)
+ lea 16*4(%rdx),%rdx
+ dec $num
+ jnz .Lsqr8x_zero
+
+ mov \$1,%rax
+ mov -48(%rsi),%r15
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
+.Lsqr8x_epilogue:
+ ret
+.size bn_sqr8x_mont,.-bn_sqr8x_mont
+___
+}}}
+
+if ($addx) {{{
+my $bp="%rdx"; # original value
+
+$code.=<<___;
+.type bn_mulx4x_mont,\@function,6
+.align 32
+bn_mulx4x_mont:
+.Lmulx4x_enter:
+ mov %rsp,%rax
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ shl \$3,${num}d # convert $num to bytes
+ .byte 0x67
+ xor %r10,%r10
+ sub $num,%r10 # -$num
+ mov ($n0),$n0 # *n0
+ lea -72(%rsp,%r10),%rsp # alloca(frame+$num+8)
+ lea ($bp,$num),%r10
+ and \$-128,%rsp
+ ##############################################################
+ # Stack layout
+ # +0 num
+ # +8 off-loaded &b[i]
+ # +16 end of b[num]
+ # +24 saved n0
+ # +32 saved rp
+ # +40 saved %rsp
+ # +48 inner counter
+ # +56
+ # +64 tmp[num+1]
+ #
+ mov $num,0(%rsp) # save $num
+ shr \$5,$num
+ mov %r10,16(%rsp) # end of b[num]
+ sub \$1,$num
+ mov $n0, 24(%rsp) # save *n0
+ mov $rp, 32(%rsp) # save $rp
+ mov %rax,40(%rsp) # save original %rsp
+ mov $num,48(%rsp) # inner counter
+ jmp .Lmulx4x_body
+
+.align 32
+.Lmulx4x_body:
+___
+my ($aptr, $bptr, $nptr, $tptr, $mi, $bi, $zero, $num)=
+ ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
+my $rptr=$bptr;
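+# In the MULX path below two independent carry chains are interleaved:
+# ADCX adds with carry using only CF, ADOX using only OF, and MULX
+# computes a 64x64->128-bit product without touching flags at all. This
+# lets the ap[]*b[i] and np[]*m1 accumulations run side by side without
+# the flag dependencies a mul/adc sequence would impose.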
+$code.=<<___;
+ lea 8($bp),$bptr
+ mov ($bp),%rdx # b[0], $bp==%rdx actually
+ lea 64+32(%rsp),$tptr
+ mov %rdx,$bi
+
+ mulx 0*8($aptr),$mi,%rax # a[0]*b[0]
+ mulx 1*8($aptr),%r11,%r14 # a[1]*b[0]
+ add %rax,%r11
+ mov $bptr,8(%rsp) # off-load &b[i]
+ mulx 2*8($aptr),%r12,%r13 # ...
+ adc %r14,%r12
+ adc \$0,%r13
+
+ mov $mi,$bptr # borrow $bptr
+ imulq 24(%rsp),$mi # "t[0]"*n0
+ xor $zero,$zero # cf=0, of=0
+
+ mulx 3*8($aptr),%rax,%r14
+ mov $mi,%rdx
+ lea 4*8($aptr),$aptr
+ adcx %rax,%r13
+ adcx $zero,%r14 # cf=0
+
+ mulx 0*8($nptr),%rax,%r10
+ adcx %rax,$bptr # discarded
+ adox %r11,%r10
+ mulx 1*8($nptr),%rax,%r11
+ adcx %rax,%r10
+ adox %r12,%r11
+ .byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 # mulx 2*8($nptr),%rax,%r12
+ mov 48(%rsp),$bptr # counter value
+ mov %r10,-4*8($tptr)
+ adcx %rax,%r11
+ adox %r13,%r12
+ mulx 3*8($nptr),%rax,%r15
+ mov $bi,%rdx
+ mov %r11,-3*8($tptr)
+ adcx %rax,%r12
+ adox $zero,%r15 # of=0
+ lea 4*8($nptr),$nptr
+ mov %r12,-2*8($tptr)
+
+ jmp .Lmulx4x_1st
+
+.align 32
+.Lmulx4x_1st:
+ adcx $zero,%r15 # cf=0, modulo-scheduled
+ mulx 0*8($aptr),%r10,%rax # a[4]*b[0]
+ adcx %r14,%r10
+ mulx 1*8($aptr),%r11,%r14 # a[5]*b[0]
+ adcx %rax,%r11
+ mulx 2*8($aptr),%r12,%rax # ...
+ adcx %r14,%r12
+ mulx 3*8($aptr),%r13,%r14
+ .byte 0x67,0x67
+ mov $mi,%rdx
+ adcx %rax,%r13
+ adcx $zero,%r14 # cf=0
+ lea 4*8($aptr),$aptr
+ lea 4*8($tptr),$tptr
+
+ adox %r15,%r10
+ mulx 0*8($nptr),%rax,%r15
+ adcx %rax,%r10
+ adox %r15,%r11
+ mulx 1*8($nptr),%rax,%r15
+ adcx %rax,%r11
+ adox %r15,%r12
+ mulx 2*8($nptr),%rax,%r15
+ mov %r10,-5*8($tptr)
+ adcx %rax,%r12
+ mov %r11,-4*8($tptr)
+ adox %r15,%r13
+ mulx 3*8($nptr),%rax,%r15
+ mov $bi,%rdx
+ mov %r12,-3*8($tptr)
+ adcx %rax,%r13
+ adox $zero,%r15
+ lea 4*8($nptr),$nptr
+ mov %r13,-2*8($tptr)
+
+ dec $bptr # of=0, pass cf
+ jnz .Lmulx4x_1st
+
+ mov 0(%rsp),$num # load num
+ mov 8(%rsp),$bptr # re-load &b[i]
+ adc $zero,%r15 # modulo-scheduled
+ add %r15,%r14
+ sbb %r15,%r15 # top-most carry
+ mov %r14,-1*8($tptr)
+ jmp .Lmulx4x_outer
+
+.align 32
+.Lmulx4x_outer:
+ mov ($bptr),%rdx # b[i]
+ lea 8($bptr),$bptr # b++
+ sub $num,$aptr # rewind $aptr
+ mov %r15,($tptr) # save top-most carry
+ lea 64+4*8(%rsp),$tptr
+ sub $num,$nptr # rewind $nptr
+
+ mulx 0*8($aptr),$mi,%r11 # a[0]*b[i]
+ xor %ebp,%ebp # xor $zero,$zero # cf=0, of=0
+ mov %rdx,$bi
+ mulx 1*8($aptr),%r14,%r12 # a[1]*b[i]
+ adox -4*8($tptr),$mi
+ adcx %r14,%r11
+ mulx 2*8($aptr),%r15,%r13 # ...
+ adox -3*8($tptr),%r11
+ adcx %r15,%r12
+ adox $zero,%r12
+ adcx $zero,%r13
+
+ mov $bptr,8(%rsp) # off-load &b[i]
+ .byte 0x67
+ mov $mi,%r15
+ imulq 24(%rsp),$mi # "t[0]"*n0
+ xor %ebp,%ebp # xor $zero,$zero # cf=0, of=0
+
+ mulx 3*8($aptr),%rax,%r14
+ mov $mi,%rdx
+ adox -2*8($tptr),%r12
+ adcx %rax,%r13
+ adox -1*8($tptr),%r13
+ adcx $zero,%r14
+ lea 4*8($aptr),$aptr
+ adox $zero,%r14
+
+ mulx 0*8($nptr),%rax,%r10
+ adcx %rax,%r15 # discarded
+ adox %r11,%r10
+ mulx 1*8($nptr),%rax,%r11
+ adcx %rax,%r10
+ adox %r12,%r11
+ mulx 2*8($nptr),%rax,%r12
+ mov %r10,-4*8($tptr)
+ adcx %rax,%r11
+ adox %r13,%r12
+ mulx 3*8($nptr),%rax,%r15
+ mov $bi,%rdx
+ mov %r11,-3*8($tptr)
+ lea 4*8($nptr),$nptr
+ adcx %rax,%r12
+ adox $zero,%r15 # of=0
+ mov 48(%rsp),$bptr # counter value
+ mov %r12,-2*8($tptr)
+
+ jmp .Lmulx4x_inner
+
+.align 32
+.Lmulx4x_inner:
+ mulx 0*8($aptr),%r10,%rax # a[4]*b[i]
+ adcx $zero,%r15 # cf=0, modulo-scheduled
+ adox %r14,%r10
+ mulx 1*8($aptr),%r11,%r14 # a[5]*b[i]
+ adcx 0*8($tptr),%r10
+ adox %rax,%r11
+ mulx 2*8($aptr),%r12,%rax # ...
+ adcx 1*8($tptr),%r11
+ adox %r14,%r12
+ mulx 3*8($aptr),%r13,%r14
+ mov $mi,%rdx
+ adcx 2*8($tptr),%r12
+ adox %rax,%r13
+ adcx 3*8($tptr),%r13
+ adox $zero,%r14 # of=0
+ lea 4*8($aptr),$aptr
+ lea 4*8($tptr),$tptr
+ adcx $zero,%r14 # cf=0
+
+ adox %r15,%r10
+ mulx 0*8($nptr),%rax,%r15
+ adcx %rax,%r10
+ adox %r15,%r11
+ mulx 1*8($nptr),%rax,%r15
+ adcx %rax,%r11
+ adox %r15,%r12
+ mulx 2*8($nptr),%rax,%r15
+ mov %r10,-5*8($tptr)
+ adcx %rax,%r12
+ adox %r15,%r13
+ mulx 3*8($nptr),%rax,%r15
+ mov $bi,%rdx
+ mov %r11,-4*8($tptr)
+ mov %r12,-3*8($tptr)
+ adcx %rax,%r13
+ adox $zero,%r15
+ lea 4*8($nptr),$nptr
+ mov %r13,-2*8($tptr)
+
+ dec $bptr # of=0, pass cf
+ jnz .Lmulx4x_inner
+
+ mov 0(%rsp),$num # load num
+ mov 8(%rsp),$bptr # re-load &b[i]
+ adc $zero,%r15 # modulo-scheduled
+ sub 0*8($tptr),$zero # pull top-most carry
+ adc %r15,%r14
+ mov -8($nptr),$mi
+ sbb %r15,%r15 # top-most carry
+ mov %r14,-1*8($tptr)
+
+ cmp 16(%rsp),$bptr
+ jne .Lmulx4x_outer
+
+ sub %r14,$mi # compare top-most words
+ sbb $mi,$mi
+ or $mi,%r15
+
+ neg $num
+ xor %rdx,%rdx
+ mov 32(%rsp),$rptr # restore rp
+ lea 64(%rsp),$tptr
+
+ pxor %xmm0,%xmm0
+ mov 0*8($nptr,$num),%r8
+ mov 1*8($nptr,$num),%r9
+ neg %r8
+ jmp .Lmulx4x_sub_entry
+
+.align 32
+.Lmulx4x_sub:
+ mov 0*8($nptr,$num),%r8
+ mov 1*8($nptr,$num),%r9
+ not %r8
+.Lmulx4x_sub_entry:
+ mov 2*8($nptr,$num),%r10
+ not %r9
+ and %r15,%r8
+ mov 3*8($nptr,$num),%r11
+ not %r10
+ and %r15,%r9
+ not %r11
+ and %r15,%r10
+ and %r15,%r11
+
+ neg %rdx # mov %rdx,%cf
+ adc 0*8($tptr),%r8
+ adc 1*8($tptr),%r9
+ movdqa %xmm0,($tptr)
+ adc 2*8($tptr),%r10
+ adc 3*8($tptr),%r11
+ movdqa %xmm0,16($tptr)
+ lea 4*8($tptr),$tptr
+ sbb %rdx,%rdx # mov %cf,%rdx
+
+ mov %r8,0*8($rptr)
+ mov %r9,1*8($rptr)
+ mov %r10,2*8($rptr)
+ mov %r11,3*8($rptr)
+ lea 4*8($rptr),$rptr
+
+ add \$32,$num
+ jnz .Lmulx4x_sub
+
+ mov 40(%rsp),%rsi # restore %rsp
+ mov \$1,%rax
+ mov -48(%rsi),%r15
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
+.Lmulx4x_epilogue:
+ ret
+.size bn_mulx4x_mont,.-bn_mulx4x_mont
+___
+}}}
+$code.=<<___;
+.asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align 16
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type mul_handler,\@abi-omnipotent
+.align 16
+mul_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # end of prologue label
+ cmp %r10,%rbx # context->Rip<end of prologue label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ mov 192($context),%r10 # pull $num
+ mov 8(%rax,%r10,8),%rax # pull saved stack pointer
+ lea 48(%rax),%rax
+
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
+ mov -48(%rax),%r15
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
+
+ jmp .Lcommon_seh_tail
+.size mul_handler,.-mul_handler
+
+.type sqr_handler,\@abi-omnipotent
+.align 16
+sqr_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # end of prologue label
+ cmp %r10,%rbx # context->Rip<.Lsqr_body
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=.Lsqr_epilogue
+ jae .Lcommon_seh_tail
+
+ mov 40(%rax),%rax # pull saved stack pointer
+
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
+ mov -48(%rax),%r15
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
+
+.Lcommon_seh_tail:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size sqr_handler,.-sqr_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_bn_mul_mont
+ .rva .LSEH_end_bn_mul_mont
+ .rva .LSEH_info_bn_mul_mont
+
+ .rva .LSEH_begin_bn_mul4x_mont
+ .rva .LSEH_end_bn_mul4x_mont
+ .rva .LSEH_info_bn_mul4x_mont
+
+ .rva .LSEH_begin_bn_sqr8x_mont
+ .rva .LSEH_end_bn_sqr8x_mont
+ .rva .LSEH_info_bn_sqr8x_mont
+___
+$code.=<<___ if ($addx);
+ .rva .LSEH_begin_bn_mulx4x_mont
+ .rva .LSEH_end_bn_mulx4x_mont
+ .rva .LSEH_info_bn_mulx4x_mont
+___
+$code.=<<___;
+.section .xdata
+.align 8
+.LSEH_info_bn_mul_mont:
+ .byte 9,0,0,0
+ .rva mul_handler
+ .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
+.LSEH_info_bn_mul4x_mont:
+ .byte 9,0,0,0
+ .rva mul_handler
+ .rva .Lmul4x_body,.Lmul4x_epilogue # HandlerData[]
+.LSEH_info_bn_sqr8x_mont:
+ .byte 9,0,0,0
+ .rva sqr_handler
+ .rva .Lsqr8x_body,.Lsqr8x_epilogue # HandlerData[]
+___
+$code.=<<___ if ($addx);
+.LSEH_info_bn_mulx4x_mont:
+ .byte 9,0,0,0
+ .rva sqr_handler
+ .rva .Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[]
+___
+}
+
+print $code;
+close STDOUT;
diff --git a/src/crypto/bn/asm/x86_64-mont5.pl b/src/crypto/bn/asm/x86_64-mont5.pl
new file mode 100644
index 0000000..80e9126
--- /dev/null
+++ b/src/crypto/bn/asm/x86_64-mont5.pl
@@ -0,0 +1,3499 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# August 2011.
+#
+# Companion to x86_64-mont.pl that optimizes cache-timing attack
+# countermeasures. The subroutines are produced by replacing bp[i]
+# references in their x86_64-mont.pl counterparts with cache-neutral
+# references to a powers table computed in BN_mod_exp_mont_consttime.
+# In addition, a subroutine that scatters elements of the powers table
+# is implemented, so that scattering/gathering can be tuned without
+# modifying bn_exp.c.
+
+# August 2013.
+#
+# Add MULX/AD*X code paths and additional interfaces to optimize for
+# the branch prediction unit. For input lengths that are multiples of
+# 8, the np argument is not just the modulus value, but one interleaved
+# with zeros (n[0], 0, n[1], 0, ...), which is why the code addresses
+# np in 16-byte strides. This is done to optimize the post-condition...
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+ $addx = ($1>=2.23);
+}
+
+if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+ $addx = ($1>=2.10);
+}
+
+if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+ $addx = ($1>=12);
+}
+
+# int bn_mul_mont_gather5(
+$rp="%rdi"; # BN_ULONG *rp,
+$ap="%rsi"; # const BN_ULONG *ap,
+$bp="%rdx"; # const BN_ULONG *bp,
+$np="%rcx"; # const BN_ULONG *np,
+$n0="%r8"; # const BN_ULONG *n0,
+$num="%r9"; # int num,
+ # int idx); # 0 to 2^5-1, "index" in $bp holding
+ # pre-computed powers of a', interlaced
+				# in such a manner that b[0] is $bp[idx],
+				# b[1] is $bp[2^5+idx], etc.
+$lo0="%r10";
+$hi0="%r11";
+$hi1="%r13";
+$i="%r14";
+$j="%r15";
+$m0="%rbx";
+$m1="%rbp";
+
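+# Constant-time gather: the powers table in $bp holds 2^5 = 32 entries
+# whose limbs are interleaved $STRIDE bytes apart. Each limb is fetched
+# by reading one qword from every cache line of the stride and masking
+# with the .Lmagic_masks patterns, so the same cache lines are touched
+# no matter what the secret index is.
+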
+$code=<<___;
+.text
+
+.extern OPENSSL_ia32cap_P
+
+.globl bn_mul_mont_gather5
+.type bn_mul_mont_gather5,\@function,6
+.align 64
+bn_mul_mont_gather5:
+ test \$7,${num}d
+ jnz .Lmul_enter
+___
+$code.=<<___ if ($addx);
+ mov OPENSSL_ia32cap_P+8(%rip),%r11d
+___
+$code.=<<___;
+ jmp .Lmul4x_enter
+
+.align 16
+.Lmul_enter:
+ mov ${num}d,${num}d
+ mov %rsp,%rax
+ mov `($win64?56:8)`(%rsp),%r10d # load 7th argument
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+___
+$code.=<<___ if ($win64);
+ lea -0x28(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+___
+$code.=<<___;
+ lea 2($num),%r11
+ neg %r11
+ lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+2))
+ and \$-1024,%rsp # minimize TLB usage
+
+ mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
+.Lmul_body:
+ mov $bp,%r12 # reassign $bp
+___
+ $bp="%r12";
+ $STRIDE=2**5*8; # 5 is "window size"
+ $N=$STRIDE/4; # should match cache line size
+$code.=<<___;
+ mov %r10,%r11
+ shr \$`log($N/8)/log(2)`,%r10
+ and \$`$N/8-1`,%r11
+ not %r10
+ lea .Lmagic_masks(%rip),%rax
+ and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
+ lea 96($bp,%r11,8),$bp # pointer within 1st cache line
+ movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
+ movq 8(%rax,%r10,8),%xmm5 # cache line contains element
+ movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
+ movq 24(%rax,%r10,8),%xmm7
+
+ movq `0*$STRIDE/4-96`($bp),%xmm0
+ movq `1*$STRIDE/4-96`($bp),%xmm1
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($bp),%xmm2
+ pand %xmm5,%xmm1
+ movq `3*$STRIDE/4-96`($bp),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+ por %xmm2,%xmm0
+ lea $STRIDE($bp),$bp
+ por %xmm3,%xmm0
+
+ movq %xmm0,$m0 # m0=bp[0]
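+	# all four cache lines of the stride were read; the masks kept
+	# only the line holding this limb and the por ops merged the lanes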
+
+ mov ($n0),$n0 # pull n0[0] value
+ mov ($ap),%rax
+
+ xor $i,$i # i=0
+ xor $j,$j # j=0
+
+ movq `0*$STRIDE/4-96`($bp),%xmm0
+ movq `1*$STRIDE/4-96`($bp),%xmm1
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($bp),%xmm2
+ pand %xmm5,%xmm1
+
+ mov $n0,$m1
+ mulq $m0 # ap[0]*bp[0]
+ mov %rax,$lo0
+ mov ($np),%rax
+
+ movq `3*$STRIDE/4-96`($bp),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+
+ imulq $lo0,$m1 # "tp[0]"*n0
+ mov %rdx,$hi0
+
+ por %xmm2,%xmm0
+ lea $STRIDE($bp),$bp
+ por %xmm3,%xmm0
+
+ mulq $m1 # np[0]*m1
+ add %rax,$lo0 # discarded
+ mov 8($ap),%rax
+ adc \$0,%rdx
+ mov %rdx,$hi1
+
+ lea 1($j),$j # j++
+ jmp .L1st_enter
+
+.align 16
+.L1st:
+ add %rax,$hi1
+ mov ($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
+ mov $lo0,$hi0
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$hi1
+
+.L1st_enter:
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$hi0
+ mov ($np,$j,8),%rax
+ adc \$0,%rdx
+ lea 1($j),$j # j++
+ mov %rdx,$lo0
+
+ mulq $m1 # np[j]*m1
+ cmp $num,$j
+ jne .L1st
+
+ movq %xmm0,$m0 # bp[1]
+
+ add %rax,$hi1
+ mov ($ap),%rax # ap[0]
+ adc \$0,%rdx
+ add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$hi1
+ mov $lo0,$hi0
+
+ xor %rdx,%rdx
+ add $hi0,$hi1
+ adc \$0,%rdx
+ mov $hi1,-8(%rsp,$num,8)
+ mov %rdx,(%rsp,$num,8) # store upmost overflow bit
+
+ lea 1($i),$i # i++
+ jmp .Louter
+.align 16
+.Louter:
+ xor $j,$j # j=0
+ mov $n0,$m1
+ mov (%rsp),$lo0
+
+ movq `0*$STRIDE/4-96`($bp),%xmm0
+ movq `1*$STRIDE/4-96`($bp),%xmm1
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($bp),%xmm2
+ pand %xmm5,%xmm1
+
+ mulq $m0 # ap[0]*bp[i]
+ add %rax,$lo0 # ap[0]*bp[i]+tp[0]
+ mov ($np),%rax
+ adc \$0,%rdx
+
+ movq `3*$STRIDE/4-96`($bp),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+
+ imulq $lo0,$m1 # tp[0]*n0
+ mov %rdx,$hi0
+
+ por %xmm2,%xmm0
+ lea $STRIDE($bp),$bp
+ por %xmm3,%xmm0
+
+ mulq $m1 # np[0]*m1
+ add %rax,$lo0 # discarded
+ mov 8($ap),%rax
+ adc \$0,%rdx
+ mov 8(%rsp),$lo0 # tp[1]
+ mov %rdx,$hi1
+
+ lea 1($j),$j # j++
+ jmp .Linner_enter
+
+.align 16
+.Linner:
+ add %rax,$hi1
+ mov ($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
+ mov (%rsp,$j,8),$lo0
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$hi1
+
+.Linner_enter:
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$hi0
+ mov ($np,$j,8),%rax
+ adc \$0,%rdx
+ add $hi0,$lo0 # ap[j]*bp[i]+tp[j]
+ mov %rdx,$hi0
+ adc \$0,$hi0
+ lea 1($j),$j # j++
+
+ mulq $m1 # np[j]*m1
+ cmp $num,$j
+ jne .Linner
+
+ movq %xmm0,$m0 # bp[i+1]
+
+ add %rax,$hi1
+ mov ($ap),%rax # ap[0]
+ adc \$0,%rdx
+ add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
+ mov (%rsp,$j,8),$lo0
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$hi1
+
+ xor %rdx,%rdx
+ add $hi0,$hi1
+ adc \$0,%rdx
+ add $lo0,$hi1 # pull upmost overflow bit
+ adc \$0,%rdx
+ mov $hi1,-8(%rsp,$num,8)
+ mov %rdx,(%rsp,$num,8) # store upmost overflow bit
+
+ lea 1($i),$i # i++
+ cmp $num,$i
+ jb .Louter
+
+ xor $i,$i # i=0 and clear CF!
+ mov (%rsp),%rax # tp[0]
+ lea (%rsp),$ap # borrow ap for tp
+ mov $num,$j # j=num
+ jmp .Lsub
+.align 16
+.Lsub: sbb ($np,$i,8),%rax
+ mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
+ mov 8($ap,$i,8),%rax # tp[i+1]
+ lea 1($i),$i # i++
+	dec	$j		# doesn't affect CF!
+ jnz .Lsub
+
+ sbb \$0,%rax # handle upmost overflow bit
+ xor $i,$i
+ mov $num,$j # j=num
+.align 16
+.Lcopy: # copy or in-place refresh
+ mov (%rsp,$i,8),$ap
+ mov ($rp,$i,8),$np
+ xor $np,$ap # conditional select:
+ and %rax,$ap # ((ap ^ np) & %rax) ^ np
+ xor $np,$ap # ap = borrow?tp:rp
+ mov $i,(%rsp,$i,8) # zap temporary vector
+ mov $ap,($rp,$i,8) # rp[i]=tp[i]
+ lea 1($i),$i
+ sub \$1,$j
+ jnz .Lcopy
+
+ mov 8(%rsp,$num,8),%rsi # restore %rsp
+ mov \$1,%rax
+___
+$code.=<<___ if ($win64);
+ movaps -88(%rsi),%xmm6
+ movaps -72(%rsi),%xmm7
+___
+$code.=<<___;
+ mov -48(%rsi),%r15
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
+.Lmul_epilogue:
+ ret
+.size bn_mul_mont_gather5,.-bn_mul_mont_gather5
+___
+{{{
+my @A=("%r10","%r11");
+my @N=("%r13","%rdi");
+$code.=<<___;
+.type bn_mul4x_mont_gather5,\@function,6
+.align 32
+bn_mul4x_mont_gather5:
+.Lmul4x_enter:
+___
+$code.=<<___ if ($addx);
+ and \$0x80100,%r11d
+ cmp \$0x80100,%r11d
+ je .Lmulx4x_enter
+___
+$code.=<<___;
+ .byte 0x67
+ mov %rsp,%rax
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+___
+$code.=<<___ if ($win64);
+ lea -0x28(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+___
+$code.=<<___;
+ .byte 0x67
+ mov ${num}d,%r10d
+ shl \$3,${num}d
+ shl \$3+2,%r10d # 4*$num
+ neg $num # -$num
+
+ ##############################################################
+ # ensure that stack frame doesn't alias with $aptr+4*$num
+ # modulo 4096, which covers ret[num], am[num] and n[2*num]
+	# (see bn_exp.c). this is done to allow the memory disambiguation
+	# logic to do its magic. [an excessive frame is allocated in order
+	# to allow bn_from_mont8x to clear it.]
+ #
+ lea -64(%rsp,$num,2),%r11
+ sub $ap,%r11
+ and \$4095,%r11
+ cmp %r11,%r10
+ jb .Lmul4xsp_alt
+ sub %r11,%rsp # align with $ap
+ lea -64(%rsp,$num,2),%rsp # alloca(128+num*8)
+ jmp .Lmul4xsp_done
+
+.align 32
+.Lmul4xsp_alt:
+ lea 4096-64(,$num,2),%r10
+ lea -64(%rsp,$num,2),%rsp # alloca(128+num*8)
+ sub %r10,%r11
+ mov \$0,%r10
+ cmovc %r10,%r11
+ sub %r11,%rsp
+.Lmul4xsp_done:
+ and \$-64,%rsp
+ neg $num
+
+ mov %rax,40(%rsp)
+.Lmul4x_body:
+
+ call mul4x_internal
+
+ mov 40(%rsp),%rsi # restore %rsp
+ mov \$1,%rax
+___
+$code.=<<___ if ($win64);
+ movaps -88(%rsi),%xmm6
+ movaps -72(%rsi),%xmm7
+___
+$code.=<<___;
+ mov -48(%rsi),%r15
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
+.Lmul4x_epilogue:
+ ret
+.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
+
+.type mul4x_internal,\@abi-omnipotent
+.align 32
+mul4x_internal:
+ shl \$5,$num
+ mov `($win64?56:8)`(%rax),%r10d # load 7th argument
+ lea 256(%rdx,$num),%r13
+ shr \$5,$num # restore $num
+___
+ $bp="%r12";
+ $STRIDE=2**5*8; # 5 is "window size"
+ $N=$STRIDE/4; # should match cache line size
+ $tp=$i;
+$code.=<<___;
+ mov %r10,%r11
+ shr \$`log($N/8)/log(2)`,%r10
+ and \$`$N/8-1`,%r11
+ not %r10
+ lea .Lmagic_masks(%rip),%rax
+ and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
+ lea 96(%rdx,%r11,8),$bp # pointer within 1st cache line
+ movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
+ movq 8(%rax,%r10,8),%xmm5 # cache line contains element
+ add \$7,%r11
+ movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
+ movq 24(%rax,%r10,8),%xmm7
+ and \$7,%r11
+
+ movq `0*$STRIDE/4-96`($bp),%xmm0
+ lea $STRIDE($bp),$tp # borrow $tp
+ movq `1*$STRIDE/4-96`($bp),%xmm1
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($bp),%xmm2
+ pand %xmm5,%xmm1
+ movq `3*$STRIDE/4-96`($bp),%xmm3
+ pand %xmm6,%xmm2
+ .byte 0x67
+ por %xmm1,%xmm0
+ movq `0*$STRIDE/4-96`($tp),%xmm1
+ .byte 0x67
+ pand %xmm7,%xmm3
+ .byte 0x67
+ por %xmm2,%xmm0
+ movq `1*$STRIDE/4-96`($tp),%xmm2
+ .byte 0x67
+ pand %xmm4,%xmm1
+ .byte 0x67
+ por %xmm3,%xmm0
+ movq `2*$STRIDE/4-96`($tp),%xmm3
+
+ movq %xmm0,$m0 # m0=bp[0]
+ movq `3*$STRIDE/4-96`($tp),%xmm0
+ mov %r13,16+8(%rsp) # save end of b[num]
+ mov $rp, 56+8(%rsp) # save $rp
+
+ mov ($n0),$n0 # pull n0[0] value
+ mov ($ap),%rax
+ lea ($ap,$num),$ap # end of a[num]
+ neg $num
+
+ mov $n0,$m1
+ mulq $m0 # ap[0]*bp[0]
+ mov %rax,$A[0]
+ mov ($np),%rax
+
+ pand %xmm5,%xmm2
+ pand %xmm6,%xmm3
+ por %xmm2,%xmm1
+
+ imulq $A[0],$m1 # "tp[0]"*n0
+ ##############################################################
+	# $tp is chosen so that writing to the top-most element of the
+	# vector occurs just "above" references to the powers table,
+	# "above" modulo cache-line size, which effectively precludes
+	# the possibility of memory disambiguation logic failure when
+	# accessing the table.
+ #
+ lea 64+8(%rsp,%r11,8),$tp
+ mov %rdx,$A[1]
+
+ pand %xmm7,%xmm0
+ por %xmm3,%xmm1
+ lea 2*$STRIDE($bp),$bp
+ por %xmm1,%xmm0
+
+ mulq $m1 # np[0]*m1
+ add %rax,$A[0] # discarded
+ mov 8($ap,$num),%rax
+ adc \$0,%rdx
+ mov %rdx,$N[1]
+
+ mulq $m0
+ add %rax,$A[1]
+ mov 16*1($np),%rax # interleaved with 0, therefore 16*n
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1
+ add %rax,$N[1]
+ mov 16($ap,$num),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ lea 4*8($num),$j # j=4
+ lea 16*4($np),$np
+ adc \$0,%rdx
+ mov $N[1],($tp)
+ mov %rdx,$N[0]
+ jmp .L1st4x
+
+.align 32
+.L1st4x:
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[0]
+ mov -16*2($np),%rax
+ lea 32($tp),$tp
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[0],-24($tp) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[1]
+ mov -16*1($np),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap,$j),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[1],-16($tp) # tp[j-1]
+ mov %rdx,$N[0]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[0]
+ mov 16*0($np),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov 8($ap,$j),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[0],-8($tp) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[1]
+ mov 16*1($np),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov 16($ap,$j),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
+ lea 16*4($np),$np
+ adc \$0,%rdx
+ mov $N[1],($tp) # tp[j-1]
+ mov %rdx,$N[0]
+
+ add \$32,$j # j+=4
+ jnz .L1st4x
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[0]
+ mov -16*2($np),%rax
+ lea 32($tp),$tp
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[0],-24($tp) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[1]
+ mov -16*1($np),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap,$num),%rax # ap[0]
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[1],-16($tp) # tp[j-1]
+ mov %rdx,$N[0]
+
+ movq %xmm0,$m0 # bp[1]
+ lea ($np,$num,2),$np # rewind $np
+
+ xor $N[1],$N[1]
+ add $A[0],$N[0]
+ adc \$0,$N[1]
+ mov $N[0],-8($tp)
+
+ jmp .Louter4x
+
+.align 32
+.Louter4x:
+ mov ($tp,$num),$A[0]
+ mov $n0,$m1
+ mulq $m0 # ap[0]*bp[i]
+ add %rax,$A[0] # ap[0]*bp[i]+tp[0]
+ mov ($np),%rax
+ adc \$0,%rdx
+
+ movq `0*$STRIDE/4-96`($bp),%xmm0
+ movq `1*$STRIDE/4-96`($bp),%xmm1
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($bp),%xmm2
+ pand %xmm5,%xmm1
+ movq `3*$STRIDE/4-96`($bp),%xmm3
+
+ imulq $A[0],$m1 # tp[0]*n0
+ .byte 0x67
+ mov %rdx,$A[1]
+ mov $N[1],($tp) # store upmost overflow bit
+
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+ por %xmm2,%xmm0
+ lea ($tp,$num),$tp # rewind $tp
+ lea $STRIDE($bp),$bp
+ por %xmm3,%xmm0
+
+ mulq $m1 # np[0]*m1
+ add %rax,$A[0] # "$N[0]", discarded
+ mov 8($ap,$num),%rax
+ adc \$0,%rdx
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov 16*1($np),%rax # interleaved with 0, therefore 16*n
+ adc \$0,%rdx
+ add 8($tp),$A[1] # +tp[1]
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov 16($ap,$num),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j]
+ lea 4*8($num),$j # j=4
+ lea 16*4($np),$np
+ adc \$0,%rdx
+ mov %rdx,$N[0]
+ jmp .Linner4x
+
+.align 32
+.Linner4x:
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[0]
+ mov -16*2($np),%rax
+ adc \$0,%rdx
+ add 16($tp),$A[0] # ap[j]*bp[i]+tp[j]
+ lea 32($tp),$tp
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0]
+ adc \$0,%rdx
+ mov $N[1],-32($tp) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov -16*1($np),%rax
+ adc \$0,%rdx
+ add -8($tp),$A[1]
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap,$j),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ adc \$0,%rdx
+ mov $N[0],-24($tp) # tp[j-1]
+ mov %rdx,$N[0]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[0]
+ mov 16*0($np),%rax
+ adc \$0,%rdx
+ add ($tp),$A[0] # ap[j]*bp[i]+tp[j]
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov 8($ap,$j),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0]
+ adc \$0,%rdx
+ mov $N[1],-16($tp) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov 16*1($np),%rax
+ adc \$0,%rdx
+ add 8($tp),$A[1]
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov 16($ap,$j),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ lea 16*4($np),$np
+ adc \$0,%rdx
+ mov $N[0],-8($tp) # tp[j-1]
+ mov %rdx,$N[0]
+
+ add \$32,$j # j+=4
+ jnz .Linner4x
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[0]
+ mov -16*2($np),%rax
+ adc \$0,%rdx
+ add 16($tp),$A[0] # ap[j]*bp[i]+tp[j]
+ lea 32($tp),$tp
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0]
+ adc \$0,%rdx
+ mov $N[1],-32($tp) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov $m1,%rax
+ mov -16*1($np),$m1
+ adc \$0,%rdx
+ add -8($tp),$A[1]
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap,$num),%rax # ap[0]
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ adc \$0,%rdx
+ mov $N[0],-24($tp) # tp[j-1]
+ mov %rdx,$N[0]
+
+ movq %xmm0,$m0 # bp[i+1]
+ mov $N[1],-16($tp) # tp[j-1]
+ lea ($np,$num,2),$np # rewind $np
+
+ xor $N[1],$N[1]
+ add $A[0],$N[0]
+ adc \$0,$N[1]
+ add ($tp),$N[0] # pull upmost overflow bit
+ adc \$0,$N[1] # upmost overflow bit
+ mov $N[0],-8($tp)
+
+ cmp 16+8(%rsp),$bp
+ jb .Louter4x
+___
+if (1) {
+$code.=<<___;
+ sub $N[0],$m1 # compare top-most words
+ adc $j,$j # $j is zero
+ or $j,$N[1]
+ xor \$1,$N[1]
+ lea ($tp,$num),%rbx # tptr in .sqr4x_sub
+ lea ($np,$N[1],8),%rbp # nptr in .sqr4x_sub
+ mov %r9,%rcx
+ sar \$3+2,%rcx # cf=0
+ mov 56+8(%rsp),%rdi # rptr in .sqr4x_sub
+ jmp .Lsqr4x_sub
+___
+} else {
+my @ri=("%rax",$bp,$m0,$m1);
+my $rp="%rdx";
+$code.=<<___
+ xor \$1,$N[1]
+ lea ($tp,$num),$tp # rewind $tp
+ sar \$5,$num # cf=0
+ lea ($np,$N[1],8),$np
+ mov 56+8(%rsp),$rp # restore $rp
+ jmp .Lsub4x
+
+.align 32
+.Lsub4x:
+ .byte 0x66
+ mov 8*0($tp),@ri[0]
+ mov 8*1($tp),@ri[1]
+ .byte 0x66
+ sbb 16*0($np),@ri[0]
+ mov 8*2($tp),@ri[2]
+ sbb 16*1($np),@ri[1]
+ mov 3*8($tp),@ri[3]
+ lea 4*8($tp),$tp
+ sbb 16*2($np),@ri[2]
+ mov @ri[0],8*0($rp)
+ sbb 16*3($np),@ri[3]
+ lea 16*4($np),$np
+ mov @ri[1],8*1($rp)
+ mov @ri[2],8*2($rp)
+ mov @ri[3],8*3($rp)
+ lea 8*4($rp),$rp
+
+ inc $num
+ jnz .Lsub4x
+
+ ret
+___
+}
+$code.=<<___;
+.size mul4x_internal,.-mul4x_internal
+___
+}}}
+ {{{
+######################################################################
+# void bn_power5(
+my $rptr="%rdi"; # BN_ULONG *rptr,
+my $aptr="%rsi"; # const BN_ULONG *aptr,
+my $bptr="%rdx"; # const void *table,
+my $nptr="%rcx"; # const BN_ULONG *nptr,
+my $n0 ="%r8";			# const BN_ULONG *n0,
+my $num ="%r9"; # int num, has to be divisible by 8
+				# int pwr);
+
+my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
+my @A0=("%r10","%r11");
+my @A1=("%r12","%r13");
+my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
+
+$code.=<<___;
+.globl bn_power5
+.type bn_power5,\@function,6
+.align 32
+bn_power5:
+___
+$code.=<<___ if ($addx);
+ mov OPENSSL_ia32cap_P+8(%rip),%r11d
+ and \$0x80100,%r11d
+ cmp \$0x80100,%r11d
+ je .Lpowerx5_enter
+___
+$code.=<<___;
+ mov %rsp,%rax
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+___
+$code.=<<___ if ($win64);
+ lea -0x28(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+___
+$code.=<<___;
+ mov ${num}d,%r10d
+ shl \$3,${num}d # convert $num to bytes
+ shl \$3+2,%r10d # 4*$num
+ neg $num
+ mov ($n0),$n0 # *n0
+
+ ##############################################################
+ # ensure that stack frame doesn't alias with $aptr+4*$num
+ # modulo 4096, which covers ret[num], am[num] and n[2*num]
+	# (see bn_exp.c). this is done to allow the memory disambiguation
+	# logic to do its magic.
+ #
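+	# roughly, in C terms (an illustrative sketch only; $num is
+	# negative at this point and frame = 64 + 2*|num| bytes of t[]
+	# storage below %rsp):
+	#
+	#	delta = ((rsp - frame) - aptr) & 4095;
+	#	rsp  -= frame + (delta <= 4*|num| ? delta
+	#	                 : max(0, delta - (4096 - frame)));
+	#	rsp  &= -64;
+	#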
+ lea -64(%rsp,$num,2),%r11
+ sub $aptr,%r11
+ and \$4095,%r11
+ cmp %r11,%r10
+ jb .Lpwr_sp_alt
+ sub %r11,%rsp # align with $aptr
+ lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
+ jmp .Lpwr_sp_done
+
+.align 32
+.Lpwr_sp_alt:
+ lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num
+ lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
+ sub %r10,%r11
+ mov \$0,%r10
+ cmovc %r10,%r11
+ sub %r11,%rsp
+.Lpwr_sp_done:
+ and \$-64,%rsp
+ mov $num,%r10
+ neg $num
+
+ ##############################################################
+ # Stack layout
+ #
+ # +0 saved $num, used in reduction section
+ # +8 &t[2*$num], used in reduction section
+ # +32 saved *n0
+ # +40 saved %rsp
+ # +48 t[2*$num]
+ #
+ mov $n0, 32(%rsp)
+ mov %rax, 40(%rsp) # save original %rsp
+.Lpower5_body:
+ movq $rptr,%xmm1 # save $rptr
+ movq $nptr,%xmm2 # save $nptr
+ movq %r10, %xmm3 # -$num
+ movq $bptr,%xmm4
+
+ call __bn_sqr8x_internal
+ call __bn_sqr8x_internal
+ call __bn_sqr8x_internal
+ call __bn_sqr8x_internal
+ call __bn_sqr8x_internal
+
+ movq %xmm2,$nptr
+ movq %xmm4,$bptr
+ mov $aptr,$rptr
+ mov 40(%rsp),%rax
+ lea 32(%rsp),$n0
+
+ call mul4x_internal
+
+ mov 40(%rsp),%rsi # restore %rsp
+ mov \$1,%rax
+ mov -48(%rsi),%r15
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
+.Lpower5_epilogue:
+ ret
+.size bn_power5,.-bn_power5
+
+.globl bn_sqr8x_internal
+.hidden bn_sqr8x_internal
+.type bn_sqr8x_internal,\@abi-omnipotent
+.align 32
+bn_sqr8x_internal:
+__bn_sqr8x_internal:
+ ##############################################################
+ # Squaring part:
+ #
+ # a) multiply-n-add everything but a[i]*a[i];
+ # b) shift result of a) by 1 to the left and accumulate
+ # a[i]*a[i] products;
+ #
+ ##############################################################
+ # a[1]a[0]
+ # a[2]a[0]
+ # a[3]a[0]
+ # a[2]a[1]
+ # a[4]a[0]
+ # a[3]a[1]
+ # a[5]a[0]
+ # a[4]a[1]
+ # a[3]a[2]
+ # a[6]a[0]
+ # a[5]a[1]
+ # a[4]a[2]
+ # a[7]a[0]
+ # a[6]a[1]
+ # a[5]a[2]
+ # a[4]a[3]
+ # a[7]a[1]
+ # a[6]a[2]
+ # a[5]a[3]
+ # a[7]a[2]
+ # a[6]a[3]
+ # a[5]a[4]
+ # a[7]a[3]
+ # a[6]a[4]
+ # a[7]a[4]
+ # a[6]a[5]
+ # a[7]a[5]
+ # a[7]a[6]
+ # a[1]a[0]
+ # a[2]a[0]
+ # a[3]a[0]
+ # a[4]a[0]
+ # a[5]a[0]
+ # a[6]a[0]
+ # a[7]a[0]
+ # a[2]a[1]
+ # a[3]a[1]
+ # a[4]a[1]
+ # a[5]a[1]
+ # a[6]a[1]
+ # a[7]a[1]
+ # a[3]a[2]
+ # a[4]a[2]
+ # a[5]a[2]
+ # a[6]a[2]
+ # a[7]a[2]
+ # a[4]a[3]
+ # a[5]a[3]
+ # a[6]a[3]
+ # a[7]a[3]
+ # a[5]a[4]
+ # a[6]a[4]
+ # a[7]a[4]
+ # a[6]a[5]
+ # a[7]a[5]
+ # a[7]a[6]
+ # a[0]a[0]
+ # a[1]a[1]
+ # a[2]a[2]
+ # a[3]a[3]
+ # a[4]a[4]
+ # a[5]a[5]
+ # a[6]a[6]
+ # a[7]a[7]
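+	#
+	# in other words, a hedged C-level sketch of a) and b) above
+	# (64-bit limbs, carry propagation left implicit):
+	#
+	#	for (i = 0; i < num; i++)	/* a) cross products   */
+	#		for (j = i+1; j < num; j++)
+	#			t[i+j .. i+j+1] += (uint128_t)a[i]*a[j];
+	#	for (k = 2*num-1; k > 0; k--)	/* b) shift left by 1  */
+	#		t[k] = t[k]<<1 | t[k-1]>>63;
+	#	t[0] <<= 1;
+	#	for (i = 0; i < num; i++)	/*    add the squares  */
+	#		t[2*i .. 2*i+1] += (uint128_t)a[i]*a[i];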
+
+ lea 32(%r10),$i # $i=-($num-32)
+ lea ($aptr,$num),$aptr # end of a[] buffer, ($aptr,$i)=&ap[2]
+
+ mov $num,$j # $j=$num
+
+ # comments apply to $num==8 case
+ mov -32($aptr,$i),$a0 # a[0]
+ lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
+ mov -24($aptr,$i),%rax # a[1]
+ lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"]
+ mov -16($aptr,$i),$ai # a[2]
+ mov %rax,$a1
+
+ mul $a0 # a[1]*a[0]
+ mov %rax,$A0[0] # a[1]*a[0]
+ mov $ai,%rax # a[2]
+ mov %rdx,$A0[1]
+ mov $A0[0],-24($tptr,$i) # t[1]
+
+ mul $a0 # a[2]*a[0]
+ add %rax,$A0[1]
+ mov $ai,%rax
+ adc \$0,%rdx
+ mov $A0[1],-16($tptr,$i) # t[2]
+ mov %rdx,$A0[0]
+
+
+ mov -8($aptr,$i),$ai # a[3]
+ mul $a1 # a[2]*a[1]
+ mov %rax,$A1[0] # a[2]*a[1]+t[3]
+ mov $ai,%rax
+ mov %rdx,$A1[1]
+
+ lea ($i),$j
+ mul $a0 # a[3]*a[0]
+ add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3]
+ mov $ai,%rax
+ mov %rdx,$A0[1]
+ adc \$0,$A0[1]
+ add $A1[0],$A0[0]
+ adc \$0,$A0[1]
+ mov $A0[0],-8($tptr,$j) # t[3]
+ jmp .Lsqr4x_1st
+
+.align 32
+.Lsqr4x_1st:
+ mov ($aptr,$j),$ai # a[4]
+ mul $a1 # a[3]*a[1]
+ add %rax,$A1[1] # a[3]*a[1]+t[4]
+ mov $ai,%rax
+ mov %rdx,$A1[0]
+ adc \$0,$A1[0]
+
+ mul $a0 # a[4]*a[0]
+ add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4]
+ mov $ai,%rax # a[3]
+ mov 8($aptr,$j),$ai # a[5]
+ mov %rdx,$A0[0]
+ adc \$0,$A0[0]
+ add $A1[1],$A0[1]
+ adc \$0,$A0[0]
+
+
+ mul $a1 # a[4]*a[3]
+ add %rax,$A1[0] # a[4]*a[3]+t[5]
+ mov $ai,%rax
+ mov $A0[1],($tptr,$j) # t[4]
+ mov %rdx,$A1[1]
+ adc \$0,$A1[1]
+
+ mul $a0 # a[5]*a[2]
+ add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5]
+ mov $ai,%rax
+ mov 16($aptr,$j),$ai # a[6]
+ mov %rdx,$A0[1]
+ adc \$0,$A0[1]
+ add $A1[0],$A0[0]
+ adc \$0,$A0[1]
+
+ mul $a1 # a[5]*a[3]
+ add %rax,$A1[1] # a[5]*a[3]+t[6]
+ mov $ai,%rax
+ mov $A0[0],8($tptr,$j) # t[5]
+ mov %rdx,$A1[0]
+ adc \$0,$A1[0]
+
+ mul $a0 # a[6]*a[2]
+ add %rax,$A0[1] # a[6]*a[2]+a[5]*a[3]+t[6]
+ mov $ai,%rax # a[3]
+ mov 24($aptr,$j),$ai # a[7]
+ mov %rdx,$A0[0]
+ adc \$0,$A0[0]
+ add $A1[1],$A0[1]
+ adc \$0,$A0[0]
+
+
+ mul $a1 # a[6]*a[5]
+ add %rax,$A1[0] # a[6]*a[5]+t[7]
+ mov $ai,%rax
+ mov $A0[1],16($tptr,$j) # t[6]
+ mov %rdx,$A1[1]
+ adc \$0,$A1[1]
+ lea 32($j),$j
+
+ mul $a0 # a[7]*a[4]
+	add	%rax,$A0[0]		# a[7]*a[4]+a[6]*a[5]+t[7]
+ mov $ai,%rax
+ mov %rdx,$A0[1]
+ adc \$0,$A0[1]
+ add $A1[0],$A0[0]
+ adc \$0,$A0[1]
+ mov $A0[0],-8($tptr,$j) # t[7]
+
+ cmp \$0,$j
+ jne .Lsqr4x_1st
+
+ mul $a1 # a[7]*a[5]
+ add %rax,$A1[1]
+ lea 16($i),$i
+ adc \$0,%rdx
+ add $A0[1],$A1[1]
+ adc \$0,%rdx
+
+ mov $A1[1],($tptr) # t[8]
+ mov %rdx,$A1[0]
+ mov %rdx,8($tptr) # t[9]
+ jmp .Lsqr4x_outer
+
+.align 32
+.Lsqr4x_outer: # comments apply to $num==6 case
+ mov -32($aptr,$i),$a0 # a[0]
+ lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
+ mov -24($aptr,$i),%rax # a[1]
+ lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"]
+ mov -16($aptr,$i),$ai # a[2]
+ mov %rax,$a1
+
+ mul $a0 # a[1]*a[0]
+ mov -24($tptr,$i),$A0[0] # t[1]
+ add %rax,$A0[0] # a[1]*a[0]+t[1]
+ mov $ai,%rax # a[2]
+ adc \$0,%rdx
+ mov $A0[0],-24($tptr,$i) # t[1]
+ mov %rdx,$A0[1]
+
+ mul $a0 # a[2]*a[0]
+ add %rax,$A0[1]
+ mov $ai,%rax
+ adc \$0,%rdx
+ add -16($tptr,$i),$A0[1] # a[2]*a[0]+t[2]
+ mov %rdx,$A0[0]
+ adc \$0,$A0[0]
+ mov $A0[1],-16($tptr,$i) # t[2]
+
+ xor $A1[0],$A1[0]
+
+ mov -8($aptr,$i),$ai # a[3]
+ mul $a1 # a[2]*a[1]
+ add %rax,$A1[0] # a[2]*a[1]+t[3]
+ mov $ai,%rax
+ adc \$0,%rdx
+ add -8($tptr,$i),$A1[0]
+ mov %rdx,$A1[1]
+ adc \$0,$A1[1]
+
+ mul $a0 # a[3]*a[0]
+ add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3]
+ mov $ai,%rax
+ adc \$0,%rdx
+ add $A1[0],$A0[0]
+ mov %rdx,$A0[1]
+ adc \$0,$A0[1]
+ mov $A0[0],-8($tptr,$i) # t[3]
+
+ lea ($i),$j
+ jmp .Lsqr4x_inner
+
+.align 32
+.Lsqr4x_inner:
+ mov ($aptr,$j),$ai # a[4]
+ mul $a1 # a[3]*a[1]
+ add %rax,$A1[1] # a[3]*a[1]+t[4]
+ mov $ai,%rax
+ mov %rdx,$A1[0]
+ adc \$0,$A1[0]
+ add ($tptr,$j),$A1[1]
+ adc \$0,$A1[0]
+
+ .byte 0x67
+ mul $a0 # a[4]*a[0]
+ add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4]
+ mov $ai,%rax # a[3]
+ mov 8($aptr,$j),$ai # a[5]
+ mov %rdx,$A0[0]
+ adc \$0,$A0[0]
+ add $A1[1],$A0[1]
+ adc \$0,$A0[0]
+
+ mul $a1 # a[4]*a[3]
+ add %rax,$A1[0] # a[4]*a[3]+t[5]
+ mov $A0[1],($tptr,$j) # t[4]
+ mov $ai,%rax
+ mov %rdx,$A1[1]
+ adc \$0,$A1[1]
+ add 8($tptr,$j),$A1[0]
+ lea 16($j),$j # j++
+ adc \$0,$A1[1]
+
+ mul $a0 # a[5]*a[2]
+ add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5]
+ mov $ai,%rax
+ adc \$0,%rdx
+ add $A1[0],$A0[0]
+ mov %rdx,$A0[1]
+ adc \$0,$A0[1]
+ mov $A0[0],-8($tptr,$j) # t[5], "preloaded t[1]" below
+
+ cmp \$0,$j
+ jne .Lsqr4x_inner
+
+ .byte 0x67
+ mul $a1 # a[5]*a[3]
+ add %rax,$A1[1]
+ adc \$0,%rdx
+ add $A0[1],$A1[1]
+ adc \$0,%rdx
+
+ mov $A1[1],($tptr) # t[6], "preloaded t[2]" below
+ mov %rdx,$A1[0]
+ mov %rdx,8($tptr) # t[7], "preloaded t[3]" below
+
+ add \$16,$i
+ jnz .Lsqr4x_outer
+
+ # comments apply to $num==4 case
+ mov -32($aptr),$a0 # a[0]
+ lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
+ mov -24($aptr),%rax # a[1]
+ lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"]
+ mov -16($aptr),$ai # a[2]
+ mov %rax,$a1
+
+ mul $a0 # a[1]*a[0]
+ add %rax,$A0[0] # a[1]*a[0]+t[1], preloaded t[1]
+ mov $ai,%rax # a[2]
+ mov %rdx,$A0[1]
+ adc \$0,$A0[1]
+
+ mul $a0 # a[2]*a[0]
+ add %rax,$A0[1]
+ mov $ai,%rax
+ mov $A0[0],-24($tptr) # t[1]
+ mov %rdx,$A0[0]
+ adc \$0,$A0[0]
+ add $A1[1],$A0[1] # a[2]*a[0]+t[2], preloaded t[2]
+ mov -8($aptr),$ai # a[3]
+ adc \$0,$A0[0]
+
+ mul $a1 # a[2]*a[1]
+ add %rax,$A1[0] # a[2]*a[1]+t[3], preloaded t[3]
+ mov $ai,%rax
+ mov $A0[1],-16($tptr) # t[2]
+ mov %rdx,$A1[1]
+ adc \$0,$A1[1]
+
+ mul $a0 # a[3]*a[0]
+ add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3]
+ mov $ai,%rax
+ mov %rdx,$A0[1]
+ adc \$0,$A0[1]
+ add $A1[0],$A0[0]
+ adc \$0,$A0[1]
+ mov $A0[0],-8($tptr) # t[3]
+
+ mul $a1 # a[3]*a[1]
+ add %rax,$A1[1]
+ mov -16($aptr),%rax # a[2]
+ adc \$0,%rdx
+ add $A0[1],$A1[1]
+ adc \$0,%rdx
+
+ mov $A1[1],($tptr) # t[4]
+ mov %rdx,$A1[0]
+ mov %rdx,8($tptr) # t[5]
+
+ mul $ai # a[2]*a[3]
+___
+{
+my ($shift,$carry)=($a0,$a1);
+my @S=(@A1,$ai,$n0);
+$code.=<<___;
+ add \$16,$i
+ xor $shift,$shift
+ sub $num,$i # $i=16-$num
+ xor $carry,$carry
+
+ add $A1[0],%rax # t[5]
+ adc \$0,%rdx
+ mov %rax,8($tptr) # t[5]
+ mov %rdx,16($tptr) # t[6]
+ mov $carry,24($tptr) # t[7]
+
+ mov -16($aptr,$i),%rax # a[0]
+ lea 48+8(%rsp),$tptr
+ xor $A0[0],$A0[0] # t[0]
+ mov 8($tptr),$A0[1] # t[1]
+
+ lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[1] # | t[2*i]>>63
+ mov 16($tptr),$A0[0] # t[2*i+2] # prefetch
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ mov 24($tptr),$A0[1] # t[2*i+2+1] # prefetch
+ adc %rax,$S[0]
+ mov -8($aptr,$i),%rax # a[i+1] # prefetch
+ mov $S[0],($tptr)
+ adc %rdx,$S[1]
+
+ lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
+ mov $S[1],8($tptr)
+ sbb $carry,$carry # mov cf,$carry
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[3] # | t[2*i]>>63
+ mov 32($tptr),$A0[0] # t[2*i+2] # prefetch
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ mov 40($tptr),$A0[1] # t[2*i+2+1] # prefetch
+ adc %rax,$S[2]
+ mov 0($aptr,$i),%rax # a[i+1] # prefetch
+ mov $S[2],16($tptr)
+ adc %rdx,$S[3]
+ lea 16($i),$i
+ mov $S[3],24($tptr)
+ sbb $carry,$carry # mov cf,$carry
+ lea 64($tptr),$tptr
+ jmp .Lsqr4x_shift_n_add
+
+.align 32
+.Lsqr4x_shift_n_add:
+ lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[1] # | t[2*i]>>63
+ mov -16($tptr),$A0[0] # t[2*i+2] # prefetch
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch
+ adc %rax,$S[0]
+ mov -8($aptr,$i),%rax # a[i+1] # prefetch
+ mov $S[0],-32($tptr)
+ adc %rdx,$S[1]
+
+ lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
+ mov $S[1],-24($tptr)
+ sbb $carry,$carry # mov cf,$carry
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[3] # | t[2*i]>>63
+ mov 0($tptr),$A0[0] # t[2*i+2] # prefetch
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ mov 8($tptr),$A0[1] # t[2*i+2+1] # prefetch
+ adc %rax,$S[2]
+ mov 0($aptr,$i),%rax # a[i+1] # prefetch
+ mov $S[2],-16($tptr)
+ adc %rdx,$S[3]
+
+ lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
+ mov $S[3],-8($tptr)
+ sbb $carry,$carry # mov cf,$carry
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[1] # | t[2*i]>>63
+ mov 16($tptr),$A0[0] # t[2*i+2] # prefetch
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ mov 24($tptr),$A0[1] # t[2*i+2+1] # prefetch
+ adc %rax,$S[0]
+ mov 8($aptr,$i),%rax # a[i+1] # prefetch
+ mov $S[0],0($tptr)
+ adc %rdx,$S[1]
+
+ lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
+ mov $S[1],8($tptr)
+ sbb $carry,$carry # mov cf,$carry
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[3] # | t[2*i]>>63
+ mov 32($tptr),$A0[0] # t[2*i+2] # prefetch
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ mov 40($tptr),$A0[1] # t[2*i+2+1] # prefetch
+ adc %rax,$S[2]
+ mov 16($aptr,$i),%rax # a[i+1] # prefetch
+ mov $S[2],16($tptr)
+ adc %rdx,$S[3]
+ mov $S[3],24($tptr)
+ sbb $carry,$carry # mov cf,$carry
+ lea 64($tptr),$tptr
+ add \$32,$i
+ jnz .Lsqr4x_shift_n_add
+
+ lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
+ .byte 0x67
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[1] # | t[2*i]>>63
+ mov -16($tptr),$A0[0] # t[2*i+2] # prefetch
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch
+ adc %rax,$S[0]
+ mov -8($aptr),%rax # a[i+1] # prefetch
+ mov $S[0],-32($tptr)
+ adc %rdx,$S[1]
+
+ lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1|shift
+ mov $S[1],-24($tptr)
+ sbb $carry,$carry # mov cf,$carry
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[3] # | t[2*i]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ adc %rax,$S[2]
+ adc %rdx,$S[3]
+ mov $S[2],-16($tptr)
+ mov $S[3],-8($tptr)
+___
+}
+######################################################################
+# Montgomery reduction part, "word-by-word" algorithm.
+#
+# This new path is inspired by multiple submissions from Intel, by
+# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford,
+# Vinodh Gopal...
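+#
+# As a hedged C-level sketch (n0 = -n^{-1} mod 2^64, carries and the
+# final top-word bookkeeping left implicit):
+#
+#	for (i = 0; i < num; i++) {
+#		m = t[i] * n0;			/* mod 2^64 */
+#		t[i .. i+num] += m * n[0 .. num-1];
+#	}					/* t[i] ends up zero */
+#	/* result in t[num .. 2*num-1]; subtract n once if >= n */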
+{
+my ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx");
+
+$code.=<<___;
+ movq %xmm2,$nptr
+sqr8x_reduction:
+ xor %rax,%rax
+ lea ($nptr,$num,2),%rcx # end of n[]
+ lea 48+8(%rsp,$num,2),%rdx # end of t[] buffer
+ mov %rcx,0+8(%rsp)
+ lea 48+8(%rsp,$num),$tptr # end of initial t[] window
+ mov %rdx,8+8(%rsp)
+ neg $num
+ jmp .L8x_reduction_loop
+
+.align 32
+.L8x_reduction_loop:
+ lea ($tptr,$num),$tptr # start of current t[] window
+ .byte 0x66
+ mov 8*0($tptr),$m0
+ mov 8*1($tptr),%r9
+ mov 8*2($tptr),%r10
+ mov 8*3($tptr),%r11
+ mov 8*4($tptr),%r12
+ mov 8*5($tptr),%r13
+ mov 8*6($tptr),%r14
+ mov 8*7($tptr),%r15
+ mov %rax,(%rdx) # store top-most carry bit
+ lea 8*8($tptr),$tptr
+
+ .byte 0x67
+ mov $m0,%r8
+ imulq 32+8(%rsp),$m0 # n0*a[0]
+ mov 16*0($nptr),%rax # n[0]
+ mov \$8,%ecx
+ jmp .L8x_reduce
+
+.align 32
+.L8x_reduce:
+ mulq $m0
+ mov 16*1($nptr),%rax # n[1]
+ neg %r8
+ mov %rdx,%r8
+ adc \$0,%r8
+
+ mulq $m0
+ add %rax,%r9
+ mov 16*2($nptr),%rax
+ adc \$0,%rdx
+ add %r9,%r8
+ mov $m0,48-8+8(%rsp,%rcx,8) # put aside n0*a[i]
+ mov %rdx,%r9
+ adc \$0,%r9
+
+ mulq $m0
+ add %rax,%r10
+ mov 16*3($nptr),%rax
+ adc \$0,%rdx
+ add %r10,%r9
+ mov 32+8(%rsp),$carry # pull n0, borrow $carry
+ mov %rdx,%r10
+ adc \$0,%r10
+
+ mulq $m0
+ add %rax,%r11
+ mov 16*4($nptr),%rax
+ adc \$0,%rdx
+ imulq %r8,$carry # modulo-scheduled
+ add %r11,%r10
+ mov %rdx,%r11
+ adc \$0,%r11
+
+ mulq $m0
+ add %rax,%r12
+ mov 16*5($nptr),%rax
+ adc \$0,%rdx
+ add %r12,%r11
+ mov %rdx,%r12
+ adc \$0,%r12
+
+ mulq $m0
+ add %rax,%r13
+ mov 16*6($nptr),%rax
+ adc \$0,%rdx
+ add %r13,%r12
+ mov %rdx,%r13
+ adc \$0,%r13
+
+ mulq $m0
+ add %rax,%r14
+ mov 16*7($nptr),%rax
+ adc \$0,%rdx
+ add %r14,%r13
+ mov %rdx,%r14
+ adc \$0,%r14
+
+ mulq $m0
+ mov $carry,$m0 # n0*a[i]
+ add %rax,%r15
+ mov 16*0($nptr),%rax # n[0]
+ adc \$0,%rdx
+ add %r15,%r14
+ mov %rdx,%r15
+ adc \$0,%r15
+
+ dec %ecx
+ jnz .L8x_reduce
+
+ lea 16*8($nptr),$nptr
+ xor %rax,%rax
+ mov 8+8(%rsp),%rdx # pull end of t[]
+ cmp 0+8(%rsp),$nptr # end of n[]?
+ jae .L8x_no_tail
+
+ .byte 0x66
+ add 8*0($tptr),%r8
+ adc 8*1($tptr),%r9
+ adc 8*2($tptr),%r10
+ adc 8*3($tptr),%r11
+ adc 8*4($tptr),%r12
+ adc 8*5($tptr),%r13
+ adc 8*6($tptr),%r14
+ adc 8*7($tptr),%r15
+ sbb $carry,$carry # top carry
+
+ mov 48+56+8(%rsp),$m0 # pull n0*a[0]
+ mov \$8,%ecx
+ mov 16*0($nptr),%rax
+ jmp .L8x_tail
+
+.align 32
+.L8x_tail:
+ mulq $m0
+ add %rax,%r8
+ mov 16*1($nptr),%rax
+ mov %r8,($tptr) # save result
+ mov %rdx,%r8
+ adc \$0,%r8
+
+ mulq $m0
+ add %rax,%r9
+ mov 16*2($nptr),%rax
+ adc \$0,%rdx
+ add %r9,%r8
+ lea 8($tptr),$tptr # $tptr++
+ mov %rdx,%r9
+ adc \$0,%r9
+
+ mulq $m0
+ add %rax,%r10
+ mov 16*3($nptr),%rax
+ adc \$0,%rdx
+ add %r10,%r9
+ mov %rdx,%r10
+ adc \$0,%r10
+
+ mulq $m0
+ add %rax,%r11
+ mov 16*4($nptr),%rax
+ adc \$0,%rdx
+ add %r11,%r10
+ mov %rdx,%r11
+ adc \$0,%r11
+
+ mulq $m0
+ add %rax,%r12
+ mov 16*5($nptr),%rax
+ adc \$0,%rdx
+ add %r12,%r11
+ mov %rdx,%r12
+ adc \$0,%r12
+
+ mulq $m0
+ add %rax,%r13
+ mov 16*6($nptr),%rax
+ adc \$0,%rdx
+ add %r13,%r12
+ mov %rdx,%r13
+ adc \$0,%r13
+
+ mulq $m0
+ add %rax,%r14
+ mov 16*7($nptr),%rax
+ adc \$0,%rdx
+ add %r14,%r13
+ mov %rdx,%r14
+ adc \$0,%r14
+
+ mulq $m0
+ mov 48-16+8(%rsp,%rcx,8),$m0# pull n0*a[i]
+ add %rax,%r15
+ adc \$0,%rdx
+ add %r15,%r14
+ mov 16*0($nptr),%rax # pull n[0]
+ mov %rdx,%r15
+ adc \$0,%r15
+
+ dec %ecx
+ jnz .L8x_tail
+
+ lea 16*8($nptr),$nptr
+ mov 8+8(%rsp),%rdx # pull end of t[]
+ cmp 0+8(%rsp),$nptr # end of n[]?
+ jae .L8x_tail_done # break out of loop
+
+ mov 48+56+8(%rsp),$m0 # pull n0*a[0]
+ neg $carry
+ mov 8*0($nptr),%rax # pull n[0]
+ adc 8*0($tptr),%r8
+ adc 8*1($tptr),%r9
+ adc 8*2($tptr),%r10
+ adc 8*3($tptr),%r11
+ adc 8*4($tptr),%r12
+ adc 8*5($tptr),%r13
+ adc 8*6($tptr),%r14
+ adc 8*7($tptr),%r15
+ sbb $carry,$carry # top carry
+
+ mov \$8,%ecx
+ jmp .L8x_tail
+
+.align 32
+.L8x_tail_done:
+ add (%rdx),%r8 # can this overflow?
+ xor %rax,%rax
+
+ neg $carry
+.L8x_no_tail:
+ adc 8*0($tptr),%r8
+ adc 8*1($tptr),%r9
+ adc 8*2($tptr),%r10
+ adc 8*3($tptr),%r11
+ adc 8*4($tptr),%r12
+ adc 8*5($tptr),%r13
+ adc 8*6($tptr),%r14
+ adc 8*7($tptr),%r15
+ adc \$0,%rax # top-most carry
+ mov -16($nptr),%rcx # np[num-1]
+ xor $carry,$carry
+
+ movq %xmm2,$nptr # restore $nptr
+
+ mov %r8,8*0($tptr) # store top 512 bits
+ mov %r9,8*1($tptr)
+ movq %xmm3,$num # $num is %r9, can't be moved upwards
+ mov %r10,8*2($tptr)
+ mov %r11,8*3($tptr)
+ mov %r12,8*4($tptr)
+ mov %r13,8*5($tptr)
+ mov %r14,8*6($tptr)
+ mov %r15,8*7($tptr)
+ lea 8*8($tptr),$tptr
+
+ cmp %rdx,$tptr # end of t[]?
+ jb .L8x_reduction_loop
+___
+}
+##############################################################
+# Post-condition, 4x unrolled
+#
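+# The nptr bias computed below selects between n[] proper and the
+# zero words n[] is interleaved with (see "interleaved with 0"
+# earlier), so the 4x-unrolled sbb chain runs unconditionally;
+# in effect, roughly:
+#
+#	sub  = (top_carry || t_top > n_top) ? n : 0;
+#	rp[] = t[] - sub[];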
+{
+my ($tptr,$nptr)=("%rbx","%rbp");
+$code.=<<___;
+ #xor %rsi,%rsi # %rsi was $carry above
+ sub %r15,%rcx # compare top-most words
+ lea (%rdi,$num),$tptr # %rdi was $tptr above
+ adc %rsi,%rsi
+ mov $num,%rcx
+ or %rsi,%rax
+ movq %xmm1,$rptr # restore $rptr
+ xor \$1,%rax
+ movq %xmm1,$aptr # prepare for back-to-back call
+ lea ($nptr,%rax,8),$nptr
+ sar \$3+2,%rcx # cf=0
+ jmp .Lsqr4x_sub
+
+.align 32
+.Lsqr4x_sub:
+ .byte 0x66
+ mov 8*0($tptr),%r12
+ mov 8*1($tptr),%r13
+ sbb 16*0($nptr),%r12
+ mov 8*2($tptr),%r14
+ sbb 16*1($nptr),%r13
+ mov 8*3($tptr),%r15
+ lea 8*4($tptr),$tptr
+ sbb 16*2($nptr),%r14
+ mov %r12,8*0($rptr)
+ sbb 16*3($nptr),%r15
+ lea 16*4($nptr),$nptr
+ mov %r13,8*1($rptr)
+ mov %r14,8*2($rptr)
+ mov %r15,8*3($rptr)
+ lea 8*4($rptr),$rptr
+
+ inc %rcx # pass %cf
+ jnz .Lsqr4x_sub
+___
+}
+$code.=<<___;
+ mov $num,%r10 # prepare for back-to-back call
+ neg $num # restore $num
+ ret
+.size bn_sqr8x_internal,.-bn_sqr8x_internal
+___
+{
+$code.=<<___;
+.globl bn_from_montgomery
+.type bn_from_montgomery,\@abi-omnipotent
+.align 32
+bn_from_montgomery:
+ testl \$7,`($win64?"48(%rsp)":"%r9d")`
+ jz bn_from_mont8x
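+	# only num divisible by 8 is handled here; return zero otherwise,
+	# presumably so the caller can fall back to a generic path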
+ xor %eax,%eax
+ ret
+.size bn_from_montgomery,.-bn_from_montgomery
+
+.type bn_from_mont8x,\@function,6
+.align 32
+bn_from_mont8x:
+ .byte 0x67
+ mov %rsp,%rax
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+___
+$code.=<<___ if ($win64);
+ lea -0x28(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+___
+$code.=<<___;
+ .byte 0x67
+ mov ${num}d,%r10d
+ shl \$3,${num}d # convert $num to bytes
+ shl \$3+2,%r10d # 4*$num
+ neg $num
+ mov ($n0),$n0 # *n0
+
+ ##############################################################
+ # ensure that stack frame doesn't alias with $aptr+4*$num
+ # modulo 4096, which covers ret[num], am[num] and n[2*num]
+	# (see bn_exp.c). this is done to allow the memory disambiguation
+	# logic to do its magic.
+ #
+ lea -64(%rsp,$num,2),%r11
+ sub $aptr,%r11
+ and \$4095,%r11
+ cmp %r11,%r10
+ jb .Lfrom_sp_alt
+ sub %r11,%rsp # align with $aptr
+ lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
+ jmp .Lfrom_sp_done
+
+.align 32
+.Lfrom_sp_alt:
+ lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num
+ lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
+ sub %r10,%r11
+ mov \$0,%r10
+ cmovc %r10,%r11
+ sub %r11,%rsp
+.Lfrom_sp_done:
+ and \$-64,%rsp
+ mov $num,%r10
+ neg $num
+
+ ##############################################################
+ # Stack layout
+ #
+ # +0 saved $num, used in reduction section
+ # +8 &t[2*$num], used in reduction section
+ # +32 saved *n0
+ # +40 saved %rsp
+ # +48 t[2*$num]
+ #
+ mov $n0, 32(%rsp)
+ mov %rax, 40(%rsp) # save original %rsp
+.Lfrom_body:
+ mov $num,%r11
+ lea 48(%rsp),%rax
+ pxor %xmm0,%xmm0
+ jmp .Lmul_by_1
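+	# the loop below copies |a| into the low half of t[] and clears
+	# the high half, i.e. t[] = a * 1, so that the reduction which
+	# follows yields a*R^-1 mod n (conversion out of Montgomery form)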
+
+.align 32
+.Lmul_by_1:
+ movdqu ($aptr),%xmm1
+ movdqu 16($aptr),%xmm2
+ movdqu 32($aptr),%xmm3
+ movdqa %xmm0,(%rax,$num)
+ movdqu 48($aptr),%xmm4
+ movdqa %xmm0,16(%rax,$num)
+ .byte 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00 # lea 64($aptr),$aptr
+ movdqa %xmm1,(%rax)
+ movdqa %xmm0,32(%rax,$num)
+ movdqa %xmm2,16(%rax)
+ movdqa %xmm0,48(%rax,$num)
+ movdqa %xmm3,32(%rax)
+ movdqa %xmm4,48(%rax)
+ lea 64(%rax),%rax
+ sub \$64,%r11
+ jnz .Lmul_by_1
+
+ movq $rptr,%xmm1
+ movq $nptr,%xmm2
+ .byte 0x67
+ mov $nptr,%rbp
+ movq %r10, %xmm3 # -num
+___
+$code.=<<___ if ($addx);
+ mov OPENSSL_ia32cap_P+8(%rip),%r11d
+ and \$0x80100,%r11d
+ cmp \$0x80100,%r11d
+ jne .Lfrom_mont_nox
+
+ lea (%rax,$num),$rptr
+ call sqrx8x_reduction
+
+ pxor %xmm0,%xmm0
+ lea 48(%rsp),%rax
+ mov 40(%rsp),%rsi # restore %rsp
+ jmp .Lfrom_mont_zero
+
+.align 32
+.Lfrom_mont_nox:
+___
+$code.=<<___;
+ call sqr8x_reduction
+
+ pxor %xmm0,%xmm0
+ lea 48(%rsp),%rax
+ mov 40(%rsp),%rsi # restore %rsp
+ jmp .Lfrom_mont_zero
+
+.align 32
+.Lfrom_mont_zero:
+ movdqa %xmm0,16*0(%rax)
+ movdqa %xmm0,16*1(%rax)
+ movdqa %xmm0,16*2(%rax)
+ movdqa %xmm0,16*3(%rax)
+ lea 16*4(%rax),%rax
+ sub \$32,$num
+ jnz .Lfrom_mont_zero
+
+ mov \$1,%rax
+ mov -48(%rsi),%r15
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
+.Lfrom_epilogue:
+ ret
+.size bn_from_mont8x,.-bn_from_mont8x
+___
+}
+}}}
+
+if ($addx) {{{
+my $bp="%rdx"; # restore original value
+
+$code.=<<___;
+.type bn_mulx4x_mont_gather5,\@function,6
+.align 32
+bn_mulx4x_mont_gather5:
+.Lmulx4x_enter:
+ .byte 0x67
+ mov %rsp,%rax
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+___
+$code.=<<___ if ($win64);
+ lea -0x28(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+___
+$code.=<<___;
+ .byte 0x67
+ mov ${num}d,%r10d
+ shl \$3,${num}d # convert $num to bytes
+ shl \$3+2,%r10d # 4*$num
+ neg $num # -$num
+ mov ($n0),$n0 # *n0
+
+ ##############################################################
+ # ensure that stack frame doesn't alias with $aptr+4*$num
+ # modulo 4096, which covers a[num], ret[num] and n[2*num]
+	# (see bn_exp.c). this is done to allow the memory disambiguation
+	# logic to do its magic. [excessive frame is allocated in order
+ # to allow bn_from_mont8x to clear it.]
+ #
+ lea -64(%rsp,$num,2),%r11
+ sub $ap,%r11
+ and \$4095,%r11
+ cmp %r11,%r10
+ jb .Lmulx4xsp_alt
+ sub %r11,%rsp # align with $aptr
+ lea -64(%rsp,$num,2),%rsp # alloca(frame+$num)
+ jmp .Lmulx4xsp_done
+
+.align 32
+.Lmulx4xsp_alt:
+ lea 4096-64(,$num,2),%r10 # 4096-frame-$num
+ lea -64(%rsp,$num,2),%rsp # alloca(frame+$num)
+ sub %r10,%r11
+ mov \$0,%r10
+ cmovc %r10,%r11
+ sub %r11,%rsp
+.Lmulx4xsp_done:
+ and \$-64,%rsp # ensure alignment
+ ##############################################################
+ # Stack layout
+ # +0 -num
+ # +8 off-loaded &b[i]
+ # +16 end of b[num]
+ # +24 inner counter
+ # +32 saved n0
+ # +40 saved %rsp
+ # +48
+ # +56 saved rp
+ # +64 tmp[num+1]
+ #
+ mov $n0, 32(%rsp) # save *n0
+ mov %rax,40(%rsp) # save original %rsp
+.Lmulx4x_body:
+ call mulx4x_internal
+
+ mov 40(%rsp),%rsi # restore %rsp
+ mov \$1,%rax
+___
+$code.=<<___ if ($win64);
+ movaps -88(%rsi),%xmm6
+ movaps -72(%rsi),%xmm7
+___
+$code.=<<___;
+ mov -48(%rsi),%r15
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
+.Lmulx4x_epilogue:
+ ret
+.size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
+
+.type mulx4x_internal,\@abi-omnipotent
+.align 32
+mulx4x_internal:
+ .byte 0x4c,0x89,0x8c,0x24,0x08,0x00,0x00,0x00 # mov $num,8(%rsp) # save -$num
+ .byte 0x67
+ neg $num # restore $num
+ shl \$5,$num
+ lea 256($bp,$num),%r13
+ shr \$5+5,$num
+ mov `($win64?56:8)`(%rax),%r10d # load 7th argument
+ sub \$1,$num
+ mov %r13,16+8(%rsp) # end of b[num]
+ mov $num,24+8(%rsp) # inner counter
+ mov $rp, 56+8(%rsp) # save $rp
+___
+my ($aptr, $bptr, $nptr, $tptr, $mi, $bi, $zero, $num)=
+ ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
+my $rptr=$bptr;
+my $STRIDE=2**5*8; # 5 is "window size"
+my $N=$STRIDE/4; # should match cache line size
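+# i.e. 2^5 = 32 table entries per window; with 8-byte words each row
+# of powers is $STRIDE = 256 bytes, and $N = 64 bytes is assumed to
+# match the cache line size.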
+$code.=<<___;
+ mov %r10,%r11
+ shr \$`log($N/8)/log(2)`,%r10
+ and \$`$N/8-1`,%r11
+ not %r10
+ lea .Lmagic_masks(%rip),%rax
+ and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
+ lea 96($bp,%r11,8),$bptr # pointer within 1st cache line
+ movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
+ movq 8(%rax,%r10,8),%xmm5 # cache line contains element
+ add \$7,%r11
+ movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
+ movq 24(%rax,%r10,8),%xmm7
+ and \$7,%r11
+
+ movq `0*$STRIDE/4-96`($bptr),%xmm0
+ lea $STRIDE($bptr),$tptr # borrow $tptr
+ movq `1*$STRIDE/4-96`($bptr),%xmm1
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($bptr),%xmm2
+ pand %xmm5,%xmm1
+ movq `3*$STRIDE/4-96`($bptr),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ movq `0*$STRIDE/4-96`($tptr),%xmm1
+ pand %xmm7,%xmm3
+ por %xmm2,%xmm0
+ movq `1*$STRIDE/4-96`($tptr),%xmm2
+ por %xmm3,%xmm0
+ .byte 0x67,0x67
+ pand %xmm4,%xmm1
+ movq `2*$STRIDE/4-96`($tptr),%xmm3
+
+ movq %xmm0,%rdx # bp[0]
+ movq `3*$STRIDE/4-96`($tptr),%xmm0
+ lea 2*$STRIDE($bptr),$bptr # next &b[i]
+ pand %xmm5,%xmm2
+ .byte 0x67,0x67
+ pand %xmm6,%xmm3
+ ##############################################################
+ # $tptr is chosen so that writing to top-most element of the
+ # vector occurs just "above" references to powers table,
+ # "above" modulo cache-line size, which effectively precludes
+ # possibility of memory disambiguation logic failure when
+ # accessing the table.
+ #
+ lea 64+8*4+8(%rsp,%r11,8),$tptr
+
+ mov %rdx,$bi
+ mulx 0*8($aptr),$mi,%rax # a[0]*b[0]
+ mulx 1*8($aptr),%r11,%r12 # a[1]*b[0]
+ add %rax,%r11
+ mulx 2*8($aptr),%rax,%r13 # ...
+ adc %rax,%r12
+ adc \$0,%r13
+ mulx 3*8($aptr),%rax,%r14
+
+ mov $mi,%r15
+ imulq 32+8(%rsp),$mi # "t[0]"*n0
+ xor $zero,$zero # cf=0, of=0
+ mov $mi,%rdx
+
+ por %xmm2,%xmm1
+ pand %xmm7,%xmm0
+ por %xmm3,%xmm1
+ mov $bptr,8+8(%rsp) # off-load &b[i]
+ por %xmm1,%xmm0
+
+ .byte 0x48,0x8d,0xb6,0x20,0x00,0x00,0x00 # lea 4*8($aptr),$aptr
+ adcx %rax,%r13
+ adcx $zero,%r14 # cf=0
+
+ mulx 0*16($nptr),%rax,%r10
+ adcx %rax,%r15 # discarded
+ adox %r11,%r10
+ mulx 1*16($nptr),%rax,%r11
+ adcx %rax,%r10
+ adox %r12,%r11
+ mulx 2*16($nptr),%rax,%r12
+ mov 24+8(%rsp),$bptr # counter value
+ .byte 0x66
+ mov %r10,-8*4($tptr)
+ adcx %rax,%r11
+ adox %r13,%r12
+ mulx 3*16($nptr),%rax,%r15
+ .byte 0x67,0x67
+ mov $bi,%rdx
+ mov %r11,-8*3($tptr)
+ adcx %rax,%r12
+ adox $zero,%r15 # of=0
+ .byte 0x48,0x8d,0x89,0x40,0x00,0x00,0x00 # lea 4*16($nptr),$nptr
+ mov %r12,-8*2($tptr)
+ #jmp .Lmulx4x_1st
+
+.align 32
+.Lmulx4x_1st:
+ adcx $zero,%r15 # cf=0, modulo-scheduled
+ mulx 0*8($aptr),%r10,%rax # a[4]*b[0]
+ adcx %r14,%r10
+ mulx 1*8($aptr),%r11,%r14 # a[5]*b[0]
+ adcx %rax,%r11
+ mulx 2*8($aptr),%r12,%rax # ...
+ adcx %r14,%r12
+ mulx 3*8($aptr),%r13,%r14
+ .byte 0x67,0x67
+ mov $mi,%rdx
+ adcx %rax,%r13
+ adcx $zero,%r14 # cf=0
+ lea 4*8($aptr),$aptr
+ lea 4*8($tptr),$tptr
+
+ adox %r15,%r10
+ mulx 0*16($nptr),%rax,%r15
+ adcx %rax,%r10
+ adox %r15,%r11
+ mulx 1*16($nptr),%rax,%r15
+ adcx %rax,%r11
+ adox %r15,%r12
+ mulx 2*16($nptr),%rax,%r15
+ mov %r10,-5*8($tptr)
+ adcx %rax,%r12
+ mov %r11,-4*8($tptr)
+ adox %r15,%r13
+ mulx 3*16($nptr),%rax,%r15
+ mov $bi,%rdx
+ mov %r12,-3*8($tptr)
+ adcx %rax,%r13
+ adox $zero,%r15
+ lea 4*16($nptr),$nptr
+ mov %r13,-2*8($tptr)
+
+ dec $bptr # of=0, pass cf
+ jnz .Lmulx4x_1st
+
+ mov 8(%rsp),$num # load -num
+ movq %xmm0,%rdx # bp[1]
+ adc $zero,%r15 # modulo-scheduled
+ lea ($aptr,$num),$aptr # rewind $aptr
+ add %r15,%r14
+ mov 8+8(%rsp),$bptr # re-load &b[i]
+ adc $zero,$zero # top-most carry
+ mov %r14,-1*8($tptr)
+ jmp .Lmulx4x_outer
+
+.align 32
+.Lmulx4x_outer:
+ mov $zero,($tptr) # save top-most carry
+ lea 4*8($tptr,$num),$tptr # rewind $tptr
+ mulx 0*8($aptr),$mi,%r11 # a[0]*b[i]
+ xor $zero,$zero # cf=0, of=0
+ mov %rdx,$bi
+ mulx 1*8($aptr),%r14,%r12 # a[1]*b[i]
+ adox -4*8($tptr),$mi # +t[0]
+ adcx %r14,%r11
+ mulx 2*8($aptr),%r15,%r13 # ...
+ adox -3*8($tptr),%r11
+ adcx %r15,%r12
+ mulx 3*8($aptr),%rdx,%r14
+ adox -2*8($tptr),%r12
+ adcx %rdx,%r13
+ lea ($nptr,$num,2),$nptr # rewind $nptr
+ lea 4*8($aptr),$aptr
+ adox -1*8($tptr),%r13
+ adcx $zero,%r14
+ adox $zero,%r14
+
+ .byte 0x67
+ mov $mi,%r15
+ imulq 32+8(%rsp),$mi # "t[0]"*n0
+
+ movq `0*$STRIDE/4-96`($bptr),%xmm0
+ .byte 0x67,0x67
+ mov $mi,%rdx
+ movq `1*$STRIDE/4-96`($bptr),%xmm1
+ .byte 0x67
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($bptr),%xmm2
+ .byte 0x67
+ pand %xmm5,%xmm1
+ movq `3*$STRIDE/4-96`($bptr),%xmm3
+ add \$$STRIDE,$bptr # next &b[i]
+ .byte 0x67
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+ xor $zero,$zero # cf=0, of=0
+ mov $bptr,8+8(%rsp) # off-load &b[i]
+
+ mulx 0*16($nptr),%rax,%r10
+ adcx %rax,%r15 # discarded
+ adox %r11,%r10
+ mulx 1*16($nptr),%rax,%r11
+ adcx %rax,%r10
+ adox %r12,%r11
+ mulx 2*16($nptr),%rax,%r12
+ adcx %rax,%r11
+ adox %r13,%r12
+ mulx 3*16($nptr),%rax,%r15
+ mov $bi,%rdx
+ por %xmm2,%xmm0
+ mov 24+8(%rsp),$bptr # counter value
+ mov %r10,-8*4($tptr)
+ por %xmm3,%xmm0
+ adcx %rax,%r12
+ mov %r11,-8*3($tptr)
+ adox $zero,%r15 # of=0
+ mov %r12,-8*2($tptr)
+ lea 4*16($nptr),$nptr
+ jmp .Lmulx4x_inner
+
+.align 32
+.Lmulx4x_inner:
+ mulx 0*8($aptr),%r10,%rax # a[4]*b[i]
+ adcx $zero,%r15 # cf=0, modulo-scheduled
+ adox %r14,%r10
+ mulx 1*8($aptr),%r11,%r14 # a[5]*b[i]
+ adcx 0*8($tptr),%r10
+ adox %rax,%r11
+ mulx 2*8($aptr),%r12,%rax # ...
+ adcx 1*8($tptr),%r11
+ adox %r14,%r12
+ mulx 3*8($aptr),%r13,%r14
+ mov $mi,%rdx
+ adcx 2*8($tptr),%r12
+ adox %rax,%r13
+ adcx 3*8($tptr),%r13
+ adox $zero,%r14 # of=0
+ lea 4*8($aptr),$aptr
+ lea 4*8($tptr),$tptr
+ adcx $zero,%r14 # cf=0
+
+ adox %r15,%r10
+ mulx 0*16($nptr),%rax,%r15
+ adcx %rax,%r10
+ adox %r15,%r11
+ mulx 1*16($nptr),%rax,%r15
+ adcx %rax,%r11
+ adox %r15,%r12
+ mulx 2*16($nptr),%rax,%r15
+ mov %r10,-5*8($tptr)
+ adcx %rax,%r12
+ adox %r15,%r13
+ mov %r11,-4*8($tptr)
+ mulx 3*16($nptr),%rax,%r15
+ mov $bi,%rdx
+ lea 4*16($nptr),$nptr
+ mov %r12,-3*8($tptr)
+ adcx %rax,%r13
+ adox $zero,%r15
+ mov %r13,-2*8($tptr)
+
+ dec $bptr # of=0, pass cf
+ jnz .Lmulx4x_inner
+
+ mov 0+8(%rsp),$num # load -num
+ movq %xmm0,%rdx # bp[i+1]
+ adc $zero,%r15 # modulo-scheduled
+ sub 0*8($tptr),$bptr # pull top-most carry to %cf
+ mov 8+8(%rsp),$bptr # re-load &b[i]
+ mov 16+8(%rsp),%r10
+ adc %r15,%r14
+ lea ($aptr,$num),$aptr # rewind $aptr
+ adc $zero,$zero # top-most carry
+ mov %r14,-1*8($tptr)
+
+ cmp %r10,$bptr
+ jb .Lmulx4x_outer
+
+ mov -16($nptr),%r10
+ xor %r15,%r15
+ sub %r14,%r10 # compare top-most words
+ adc %r15,%r15
+ or %r15,$zero
+ xor \$1,$zero
+ lea ($tptr,$num),%rdi # rewind $tptr
+ lea ($nptr,$num,2),$nptr # rewind $nptr
+ .byte 0x67,0x67
+ sar \$3+2,$num # cf=0
+ lea ($nptr,$zero,8),%rbp
+ mov 56+8(%rsp),%rdx # restore rp
+ mov $num,%rcx
+ jmp .Lsqrx4x_sub # common post-condition
+.size mulx4x_internal,.-mulx4x_internal
+___
+} {
+######################################################################
+# void bn_powerx5(
+my $rptr="%rdi"; # BN_ULONG *rptr,
+my $aptr="%rsi"; # const BN_ULONG *aptr,
+my $bptr="%rdx"; # const void *table,
+my $nptr="%rcx"; # const BN_ULONG *nptr,
+my $n0 ="%r8";			# const BN_ULONG *n0,
+my $num ="%r9"; # int num, has to be divisible by 8
+ # int pwr);
+
+my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
+my @A0=("%r10","%r11");
+my @A1=("%r12","%r13");
+my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
+
+$code.=<<___;
+.type bn_powerx5,\@function,6
+.align 32
+bn_powerx5:
+.Lpowerx5_enter:
+ .byte 0x67
+ mov %rsp,%rax
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+___
+$code.=<<___ if ($win64);
+ lea -0x28(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+___
+$code.=<<___;
+ .byte 0x67
+ mov ${num}d,%r10d
+ shl \$3,${num}d # convert $num to bytes
+ shl \$3+2,%r10d # 4*$num
+ neg $num
+ mov ($n0),$n0 # *n0
+
+ ##############################################################
+ # ensure that stack frame doesn't alias with $aptr+4*$num
+ # modulo 4096, which covers ret[num], am[num] and n[2*num]
+	# (see bn_exp.c). this is done to allow the memory disambiguation
+	# logic to do its magic.
+ #
+ lea -64(%rsp,$num,2),%r11
+ sub $aptr,%r11
+ and \$4095,%r11
+ cmp %r11,%r10
+ jb .Lpwrx_sp_alt
+ sub %r11,%rsp # align with $aptr
+ lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
+ jmp .Lpwrx_sp_done
+
+.align 32
+.Lpwrx_sp_alt:
+ lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num
+ lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
+ sub %r10,%r11
+ mov \$0,%r10
+ cmovc %r10,%r11
+ sub %r11,%rsp
+.Lpwrx_sp_done:
+ and \$-64,%rsp
+ mov $num,%r10
+ neg $num
+
+ ##############################################################
+ # Stack layout
+ #
+ # +0 saved $num, used in reduction section
+ # +8 &t[2*$num], used in reduction section
+ # +16 intermediate carry bit
+ # +24 top-most carry bit, used in reduction section
+ # +32 saved *n0
+ # +40 saved %rsp
+ # +48 t[2*$num]
+ #
+ pxor %xmm0,%xmm0
+ movq $rptr,%xmm1 # save $rptr
+ movq $nptr,%xmm2 # save $nptr
+ movq %r10, %xmm3 # -$num
+ movq $bptr,%xmm4
+ mov $n0, 32(%rsp)
+ mov %rax, 40(%rsp) # save original %rsp
+.Lpowerx5_body:
+
+ call __bn_sqrx8x_internal
+ call __bn_sqrx8x_internal
+ call __bn_sqrx8x_internal
+ call __bn_sqrx8x_internal
+ call __bn_sqrx8x_internal
+
+ mov %r10,$num # -num
+ mov $aptr,$rptr
+ movq %xmm2,$nptr
+ movq %xmm4,$bptr
+ mov 40(%rsp),%rax
+
+ call mulx4x_internal
+
+ mov 40(%rsp),%rsi # restore %rsp
+ mov \$1,%rax
+___
+$code.=<<___ if ($win64);
+ movaps -88(%rsi),%xmm6
+ movaps -72(%rsi),%xmm7
+___
+$code.=<<___;
+ mov -48(%rsi),%r15
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
+.Lpowerx5_epilogue:
+ ret
+.size bn_powerx5,.-bn_powerx5
+
+.globl bn_sqrx8x_internal
+.hidden bn_sqrx8x_internal
+.type bn_sqrx8x_internal,\@abi-omnipotent
+.align 32
+bn_sqrx8x_internal:
+__bn_sqrx8x_internal:
+ ##################################################################
+ # Squaring part:
+ #
+ # a) multiply-n-add everything but a[i]*a[i];
+ # b) shift result of a) by 1 to the left and accumulate
+ # a[i]*a[i] products;
+ #
+ ##################################################################
+ # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0]
+ # a[1]a[0]
+ # a[2]a[0]
+ # a[3]a[0]
+ # a[2]a[1]
+ # a[3]a[1]
+ # a[3]a[2]
+ #
+ # a[4]a[0]
+ # a[5]a[0]
+ # a[6]a[0]
+ # a[7]a[0]
+ # a[4]a[1]
+ # a[5]a[1]
+ # a[6]a[1]
+ # a[7]a[1]
+ # a[4]a[2]
+ # a[5]a[2]
+ # a[6]a[2]
+ # a[7]a[2]
+ # a[4]a[3]
+ # a[5]a[3]
+ # a[6]a[3]
+ # a[7]a[3]
+ #
+ # a[5]a[4]
+ # a[6]a[4]
+ # a[7]a[4]
+ # a[6]a[5]
+ # a[7]a[5]
+ # a[7]a[6]
+ # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0]
+___
+{
+my ($zero,$carry)=("%rbp","%rcx");
+my $aaptr=$zero;
+$code.=<<___;
+ lea 48+8(%rsp),$tptr
+ lea ($aptr,$num),$aaptr
+ mov $num,0+8(%rsp) # save $num
+ mov $aaptr,8+8(%rsp) # save end of $aptr
+ jmp .Lsqr8x_zero_start
+
+.align 32
+.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
+.Lsqrx8x_zero:
+ .byte 0x3e
+ movdqa %xmm0,0*8($tptr)
+ movdqa %xmm0,2*8($tptr)
+ movdqa %xmm0,4*8($tptr)
+ movdqa %xmm0,6*8($tptr)
+.Lsqr8x_zero_start: # aligned at 32
+ movdqa %xmm0,8*8($tptr)
+ movdqa %xmm0,10*8($tptr)
+ movdqa %xmm0,12*8($tptr)
+ movdqa %xmm0,14*8($tptr)
+ lea 16*8($tptr),$tptr
+ sub \$64,$num
+ jnz .Lsqrx8x_zero
+
+ mov 0*8($aptr),%rdx # a[0], modulo-scheduled
+ #xor %r9,%r9 # t[1], ex-$num, zero already
+ xor %r10,%r10
+ xor %r11,%r11
+ xor %r12,%r12
+ xor %r13,%r13
+ xor %r14,%r14
+ xor %r15,%r15
+ lea 48+8(%rsp),$tptr
+	xor	$zero,$zero		# cf=0, of=0
+ jmp .Lsqrx8x_outer_loop
+
+.align 32
+.Lsqrx8x_outer_loop:
+ mulx 1*8($aptr),%r8,%rax # a[1]*a[0]
+ adcx %r9,%r8 # a[1]*a[0]+=t[1]
+ adox %rax,%r10
+ mulx 2*8($aptr),%r9,%rax # a[2]*a[0]
+ adcx %r10,%r9
+ adox %rax,%r11
+ .byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 3*8($aptr),%r10,%rax # ...
+ adcx %r11,%r10
+ adox %rax,%r12
+ .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 # mulx 4*8($aptr),%r11,%rax
+ adcx %r12,%r11
+ adox %rax,%r13
+ mulx 5*8($aptr),%r12,%rax
+ adcx %r13,%r12
+ adox %rax,%r14
+ mulx 6*8($aptr),%r13,%rax
+ adcx %r14,%r13
+ adox %r15,%rax
+ mulx 7*8($aptr),%r14,%r15
+ mov 1*8($aptr),%rdx # a[1]
+ adcx %rax,%r14
+ adox $zero,%r15
+ adc 8*8($tptr),%r15
+ mov %r8,1*8($tptr) # t[1]
+ mov %r9,2*8($tptr) # t[2]
+ sbb $carry,$carry # mov %cf,$carry
+ xor $zero,$zero # cf=0, of=0
+
+
+ mulx 2*8($aptr),%r8,%rbx # a[2]*a[1]
+ mulx 3*8($aptr),%r9,%rax # a[3]*a[1]
+ adcx %r10,%r8
+ adox %rbx,%r9
+ mulx 4*8($aptr),%r10,%rbx # ...
+ adcx %r11,%r9
+ adox %rax,%r10
+ .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 # mulx 5*8($aptr),%r11,%rax
+ adcx %r12,%r10
+ adox %rbx,%r11
+ .byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 6*8($aptr),%r12,%rbx
+ adcx %r13,%r11
+ adox %r14,%r12
+ .byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 # mulx 7*8($aptr),%r13,%r14
+ mov 2*8($aptr),%rdx # a[2]
+ adcx %rax,%r12
+ adox %rbx,%r13
+ adcx %r15,%r13
+ adox $zero,%r14 # of=0
+ adcx $zero,%r14 # cf=0
+
+ mov %r8,3*8($tptr) # t[3]
+ mov %r9,4*8($tptr) # t[4]
+
+ mulx 3*8($aptr),%r8,%rbx # a[3]*a[2]
+ mulx 4*8($aptr),%r9,%rax # a[4]*a[2]
+ adcx %r10,%r8
+ adox %rbx,%r9
+ mulx 5*8($aptr),%r10,%rbx # ...
+ adcx %r11,%r9
+ adox %rax,%r10
+ .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 # mulx 6*8($aptr),%r11,%rax
+ adcx %r12,%r10
+ adox %r13,%r11
+ .byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 7*8($aptr),%r12,%r13
+ .byte 0x3e
+ mov 3*8($aptr),%rdx # a[3]
+ adcx %rbx,%r11
+ adox %rax,%r12
+ adcx %r14,%r12
+ mov %r8,5*8($tptr) # t[5]
+ mov %r9,6*8($tptr) # t[6]
+ mulx 4*8($aptr),%r8,%rax # a[4]*a[3]
+ adox $zero,%r13 # of=0
+ adcx $zero,%r13 # cf=0
+
+ mulx 5*8($aptr),%r9,%rbx # a[5]*a[3]
+ adcx %r10,%r8
+ adox %rax,%r9
+ mulx 6*8($aptr),%r10,%rax # ...
+ adcx %r11,%r9
+ adox %r12,%r10
+ mulx 7*8($aptr),%r11,%r12
+ mov 4*8($aptr),%rdx # a[4]
+ mov 5*8($aptr),%r14 # a[5]
+ adcx %rbx,%r10
+ adox %rax,%r11
+ mov 6*8($aptr),%r15 # a[6]
+ adcx %r13,%r11
+ adox $zero,%r12 # of=0
+ adcx $zero,%r12 # cf=0
+
+ mov %r8,7*8($tptr) # t[7]
+ mov %r9,8*8($tptr) # t[8]
+
+ mulx %r14,%r9,%rax # a[5]*a[4]
+ mov 7*8($aptr),%r8 # a[7]
+ adcx %r10,%r9
+ mulx %r15,%r10,%rbx # a[6]*a[4]
+ adox %rax,%r10
+ adcx %r11,%r10
+ mulx %r8,%r11,%rax # a[7]*a[4]
+ mov %r14,%rdx # a[5]
+ adox %rbx,%r11
+ adcx %r12,%r11
+ #adox $zero,%rax # of=0
+ adcx $zero,%rax # cf=0
+
+ mulx %r15,%r14,%rbx # a[6]*a[5]
+ mulx %r8,%r12,%r13 # a[7]*a[5]
+ mov %r15,%rdx # a[6]
+ lea 8*8($aptr),$aptr
+ adcx %r14,%r11
+ adox %rbx,%r12
+ adcx %rax,%r12
+ adox $zero,%r13
+
+ .byte 0x67,0x67
+ mulx %r8,%r8,%r14 # a[7]*a[6]
+ adcx %r8,%r13
+ adcx $zero,%r14
+
+ cmp 8+8(%rsp),$aptr
+ je .Lsqrx8x_outer_break
+
+ neg $carry # mov $carry,%cf
+ mov \$-8,%rcx
+ mov $zero,%r15
+ mov 8*8($tptr),%r8
+ adcx 9*8($tptr),%r9 # +=t[9]
+ adcx 10*8($tptr),%r10 # ...
+ adcx 11*8($tptr),%r11
+ adc 12*8($tptr),%r12
+ adc 13*8($tptr),%r13
+ adc 14*8($tptr),%r14
+ adc 15*8($tptr),%r15
+ lea ($aptr),$aaptr
+ lea 2*64($tptr),$tptr
+ sbb %rax,%rax # mov %cf,$carry
+
+ mov -64($aptr),%rdx # a[0]
+ mov %rax,16+8(%rsp) # offload $carry
+ mov $tptr,24+8(%rsp)
+
+ #lea 8*8($tptr),$tptr # see 2*8*8($tptr) above
+ xor %eax,%eax # cf=0, of=0
+ jmp .Lsqrx8x_loop
+
+.align 32
+.Lsqrx8x_loop:
+ mov %r8,%rbx
+ mulx 0*8($aaptr),%rax,%r8 # a[8]*a[i]
+ adcx %rax,%rbx # +=t[8]
+ adox %r9,%r8
+
+ mulx 1*8($aaptr),%rax,%r9 # ...
+ adcx %rax,%r8
+ adox %r10,%r9
+
+ mulx 2*8($aaptr),%rax,%r10
+ adcx %rax,%r9
+ adox %r11,%r10
+
+ mulx 3*8($aaptr),%rax,%r11
+ adcx %rax,%r10
+ adox %r12,%r11
+
+ .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 4*8($aaptr),%rax,%r12
+ adcx %rax,%r11
+ adox %r13,%r12
+
+ mulx 5*8($aaptr),%rax,%r13
+ adcx %rax,%r12
+ adox %r14,%r13
+
+ mulx 6*8($aaptr),%rax,%r14
+ mov %rbx,($tptr,%rcx,8) # store t[8+i]
+ mov \$0,%ebx
+ adcx %rax,%r13
+ adox %r15,%r14
+
+ .byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 # mulx 7*8($aaptr),%rax,%r15
+ mov 8($aptr,%rcx,8),%rdx # a[i]
+ adcx %rax,%r14
+ adox %rbx,%r15 # %rbx is 0, of=0
+ adcx %rbx,%r15 # cf=0
+
+ .byte 0x67
+ inc %rcx # of=0
+ jnz .Lsqrx8x_loop
+
+ lea 8*8($aaptr),$aaptr
+ mov \$-8,%rcx
+ cmp 8+8(%rsp),$aaptr # done?
+ je .Lsqrx8x_break
+
+ sub 16+8(%rsp),%rbx # mov 16(%rsp),%cf
+ .byte 0x66
+ mov -64($aptr),%rdx
+ adcx 0*8($tptr),%r8
+ adcx 1*8($tptr),%r9
+ adc 2*8($tptr),%r10
+ adc 3*8($tptr),%r11
+ adc 4*8($tptr),%r12
+ adc 5*8($tptr),%r13
+ adc 6*8($tptr),%r14
+ adc 7*8($tptr),%r15
+ lea 8*8($tptr),$tptr
+ .byte 0x67
+ sbb %rax,%rax # mov %cf,%rax
+ xor %ebx,%ebx # cf=0, of=0
+ mov %rax,16+8(%rsp) # offload carry
+ jmp .Lsqrx8x_loop
+
+.align 32
+.Lsqrx8x_break:
+ sub 16+8(%rsp),%r8 # consume last carry
+ mov 24+8(%rsp),$carry # initial $tptr, borrow $carry
+ mov 0*8($aptr),%rdx # a[8], modulo-scheduled
+ xor %ebp,%ebp # xor $zero,$zero
+ mov %r8,0*8($tptr)
+ cmp $carry,$tptr # cf=0, of=0
+ je .Lsqrx8x_outer_loop
+
+ mov %r9,1*8($tptr)
+ mov 1*8($carry),%r9
+ mov %r10,2*8($tptr)
+ mov 2*8($carry),%r10
+ mov %r11,3*8($tptr)
+ mov 3*8($carry),%r11
+ mov %r12,4*8($tptr)
+ mov 4*8($carry),%r12
+ mov %r13,5*8($tptr)
+ mov 5*8($carry),%r13
+ mov %r14,6*8($tptr)
+ mov 6*8($carry),%r14
+ mov %r15,7*8($tptr)
+ mov 7*8($carry),%r15
+ mov $carry,$tptr
+ jmp .Lsqrx8x_outer_loop
+
+.align 32
+.Lsqrx8x_outer_break:
+ mov %r9,9*8($tptr) # t[9]
+ movq %xmm3,%rcx # -$num
+ mov %r10,10*8($tptr) # ...
+ mov %r11,11*8($tptr)
+ mov %r12,12*8($tptr)
+ mov %r13,13*8($tptr)
+ mov %r14,14*8($tptr)
+___
+} {
+my $i="%rcx";
+$code.=<<___;
+ lea 48+8(%rsp),$tptr
+ mov ($aptr,$i),%rdx # a[0]
+
+ mov 8($tptr),$A0[1] # t[1]
+ xor $A0[0],$A0[0] # t[0], of=0, cf=0
+ mov 0+8(%rsp),$num # restore $num
+ adox $A0[1],$A0[1]
+ mov 16($tptr),$A1[0] # t[2] # prefetch
+ mov 24($tptr),$A1[1] # t[3] # prefetch
+ #jmp .Lsqrx4x_shift_n_add # happens to be aligned
+
+.align 32
+.Lsqrx4x_shift_n_add:
+ mulx %rdx,%rax,%rbx
+ adox $A1[0],$A1[0]
+ adcx $A0[0],%rax
+ .byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 # mov 8($aptr,$i),%rdx # a[i+1] # prefetch
+ .byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 # mov 32($tptr),$A0[0] # t[2*i+4] # prefetch
+ adox $A1[1],$A1[1]
+ adcx $A0[1],%rbx
+ mov 40($tptr),$A0[1] # t[2*i+4+1] # prefetch
+ mov %rax,0($tptr)
+ mov %rbx,8($tptr)
+
+ mulx %rdx,%rax,%rbx
+ adox $A0[0],$A0[0]
+ adcx $A1[0],%rax
+ mov 16($aptr,$i),%rdx # a[i+2] # prefetch
+ mov 48($tptr),$A1[0] # t[2*i+6] # prefetch
+ adox $A0[1],$A0[1]
+ adcx $A1[1],%rbx
+ mov 56($tptr),$A1[1] # t[2*i+6+1] # prefetch
+ mov %rax,16($tptr)
+ mov %rbx,24($tptr)
+
+ mulx %rdx,%rax,%rbx
+ adox $A1[0],$A1[0]
+ adcx $A0[0],%rax
+ mov 24($aptr,$i),%rdx # a[i+3] # prefetch
+ lea 32($i),$i
+ mov 64($tptr),$A0[0] # t[2*i+8] # prefetch
+ adox $A1[1],$A1[1]
+ adcx $A0[1],%rbx
+ mov 72($tptr),$A0[1] # t[2*i+8+1] # prefetch
+ mov %rax,32($tptr)
+ mov %rbx,40($tptr)
+
+ mulx %rdx,%rax,%rbx
+ adox $A0[0],$A0[0]
+ adcx $A1[0],%rax
+ jrcxz .Lsqrx4x_shift_n_add_break
+ .byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 # mov 0($aptr,$i),%rdx # a[i+4] # prefetch
+ adox $A0[1],$A0[1]
+ adcx $A1[1],%rbx
+ mov 80($tptr),$A1[0] # t[2*i+10] # prefetch
+ mov 88($tptr),$A1[1] # t[2*i+10+1] # prefetch
+ mov %rax,48($tptr)
+ mov %rbx,56($tptr)
+ lea 64($tptr),$tptr
+ nop
+ jmp .Lsqrx4x_shift_n_add
+
+.align 32
+.Lsqrx4x_shift_n_add_break:
+ adcx $A1[1],%rbx
+ mov %rax,48($tptr)
+ mov %rbx,56($tptr)
+ lea 64($tptr),$tptr # end of t[] buffer
+___
+}
+######################################################################
+# Montgomery reduction part, "word-by-word" algorithm.
+#
+# This new path is inspired by multiple submissions from Intel, by
+# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford,
+# Vinodh Gopal...
+{
+my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx");
+
+$code.=<<___;
+ movq %xmm2,$nptr
+sqrx8x_reduction:
+ xor %eax,%eax # initial top-most carry bit
+ mov 32+8(%rsp),%rbx # n0
+ mov 48+8(%rsp),%rdx # "%r8", 8*0($tptr)
+ lea -128($nptr,$num,2),%rcx # end of n[]
+ #lea 48+8(%rsp,$num,2),$tptr # end of t[] buffer
+ mov %rcx, 0+8(%rsp) # save end of n[]
+ mov $tptr,8+8(%rsp) # save end of t[]
+
+ lea 48+8(%rsp),$tptr # initial t[] window
+ jmp .Lsqrx8x_reduction_loop
+
+.align 32
+.Lsqrx8x_reduction_loop:
+ mov 8*1($tptr),%r9
+ mov 8*2($tptr),%r10
+ mov 8*3($tptr),%r11
+ mov 8*4($tptr),%r12
+ mov %rdx,%r8
+ imulq %rbx,%rdx # n0*a[i]
+ mov 8*5($tptr),%r13
+ mov 8*6($tptr),%r14
+ mov 8*7($tptr),%r15
+ mov %rax,24+8(%rsp) # store top-most carry bit
+
+ lea 8*8($tptr),$tptr
+ xor $carry,$carry # cf=0,of=0
+ mov \$-8,%rcx
+ jmp .Lsqrx8x_reduce
+
+.align 32
+.Lsqrx8x_reduce:
+ mov %r8, %rbx
+ mulx 16*0($nptr),%rax,%r8 # n[0]
+ adcx %rbx,%rax # discarded
+ adox %r9,%r8
+
+ mulx 16*1($nptr),%rbx,%r9 # n[1]
+ adcx %rbx,%r8
+ adox %r10,%r9
+
+ mulx 16*2($nptr),%rbx,%r10
+ adcx %rbx,%r9
+ adox %r11,%r10
+
+ mulx 16*3($nptr),%rbx,%r11
+ adcx %rbx,%r10
+ adox %r12,%r11
+
+ .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x40,0x00,0x00,0x00 # mulx 16*4($nptr),%rbx,%r12
+ mov %rdx,%rax
+ mov %r8,%rdx
+ adcx %rbx,%r11
+ adox %r13,%r12
+
+ mulx 32+8(%rsp),%rbx,%rdx # %rdx discarded
+ mov %rax,%rdx
+ mov %rax,64+48+8(%rsp,%rcx,8) # put aside n0*a[i]
+
+ mulx 16*5($nptr),%rax,%r13
+ adcx %rax,%r12
+ adox %r14,%r13
+
+ mulx 16*6($nptr),%rax,%r14
+ adcx %rax,%r13
+ adox %r15,%r14
+
+ mulx 16*7($nptr),%rax,%r15
+ mov %rbx,%rdx
+ adcx %rax,%r14
+ adox $carry,%r15 # $carry is 0
+ adcx $carry,%r15 # cf=0
+
+ .byte 0x67,0x67,0x67
+ inc %rcx # of=0
+ jnz .Lsqrx8x_reduce
+
+ mov $carry,%rax # xor %rax,%rax
+ cmp 0+8(%rsp),$nptr # end of n[]?
+ jae .Lsqrx8x_no_tail
+
+ mov 48+8(%rsp),%rdx # pull n0*a[0]
+ add 8*0($tptr),%r8
+ lea 16*8($nptr),$nptr
+ mov \$-8,%rcx
+ adcx 8*1($tptr),%r9
+ adcx 8*2($tptr),%r10
+ adc 8*3($tptr),%r11
+ adc 8*4($tptr),%r12
+ adc 8*5($tptr),%r13
+ adc 8*6($tptr),%r14
+ adc 8*7($tptr),%r15
+ lea 8*8($tptr),$tptr
+ sbb %rax,%rax # top carry
+
+ xor $carry,$carry # of=0, cf=0
+ mov %rax,16+8(%rsp)
+ jmp .Lsqrx8x_tail
+
+.align 32
+.Lsqrx8x_tail:
+ mov %r8,%rbx
+ mulx 16*0($nptr),%rax,%r8
+ adcx %rax,%rbx
+ adox %r9,%r8
+
+ mulx 16*1($nptr),%rax,%r9
+ adcx %rax,%r8
+ adox %r10,%r9
+
+ mulx 16*2($nptr),%rax,%r10
+ adcx %rax,%r9
+ adox %r11,%r10
+
+ mulx 16*3($nptr),%rax,%r11
+ adcx %rax,%r10
+ adox %r12,%r11
+
+ .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x40,0x00,0x00,0x00 # mulx 16*4($nptr),%rax,%r12
+ adcx %rax,%r11
+ adox %r13,%r12
+
+ mulx 16*5($nptr),%rax,%r13
+ adcx %rax,%r12
+ adox %r14,%r13
+
+ mulx 16*6($nptr),%rax,%r14
+ adcx %rax,%r13
+ adox %r15,%r14
+
+ mulx 16*7($nptr),%rax,%r15
+ mov 72+48+8(%rsp,%rcx,8),%rdx # pull n0*a[i]
+ adcx %rax,%r14
+ adox $carry,%r15
+ mov %rbx,($tptr,%rcx,8) # save result
+ mov %r8,%rbx
+ adcx $carry,%r15 # cf=0
+
+ inc %rcx # of=0
+ jnz .Lsqrx8x_tail
+
+ cmp 0+8(%rsp),$nptr # end of n[]?
+ jae .Lsqrx8x_tail_done # break out of loop
+
+ sub 16+8(%rsp),$carry # mov 16(%rsp),%cf
+ mov 48+8(%rsp),%rdx # pull n0*a[0]
+ lea 16*8($nptr),$nptr
+ adc 8*0($tptr),%r8
+ adc 8*1($tptr),%r9
+ adc 8*2($tptr),%r10
+ adc 8*3($tptr),%r11
+ adc 8*4($tptr),%r12
+ adc 8*5($tptr),%r13
+ adc 8*6($tptr),%r14
+ adc 8*7($tptr),%r15
+ lea 8*8($tptr),$tptr
+ sbb %rax,%rax
+ sub \$8,%rcx # mov \$-8,%rcx
+
+ xor $carry,$carry # of=0, cf=0
+ mov %rax,16+8(%rsp)
+ jmp .Lsqrx8x_tail
+
+.align 32
+.Lsqrx8x_tail_done:
+ add 24+8(%rsp),%r8 # can this overflow?
+ mov $carry,%rax # xor %rax,%rax
+
+ sub 16+8(%rsp),$carry # mov 16(%rsp),%cf
+.Lsqrx8x_no_tail: # %cf is 0 if jumped here
+ adc 8*0($tptr),%r8
+ movq %xmm3,%rcx
+ adc 8*1($tptr),%r9
+ mov 16*7($nptr),$carry
+ movq %xmm2,$nptr # restore $nptr
+ adc 8*2($tptr),%r10
+ adc 8*3($tptr),%r11
+ adc 8*4($tptr),%r12
+ adc 8*5($tptr),%r13
+ adc 8*6($tptr),%r14
+ adc 8*7($tptr),%r15
+ adc %rax,%rax # top-most carry
+
+ mov 32+8(%rsp),%rbx # n0
+ mov 8*8($tptr,%rcx),%rdx # modulo-scheduled "%r8"
+
+ mov %r8,8*0($tptr) # store top 512 bits
+ lea 8*8($tptr),%r8 # borrow %r8
+ mov %r9,8*1($tptr)
+ mov %r10,8*2($tptr)
+ mov %r11,8*3($tptr)
+ mov %r12,8*4($tptr)
+ mov %r13,8*5($tptr)
+ mov %r14,8*6($tptr)
+ mov %r15,8*7($tptr)
+
+ lea 8*8($tptr,%rcx),$tptr # start of current t[] window
+ cmp 8+8(%rsp),%r8 # end of t[]?
+ jb .Lsqrx8x_reduction_loop
+___
+}
+##############################################################
+# Post-condition, 4x unrolled
+#
+{
+my ($rptr,$nptr)=("%rdx","%rbp");
+my @ri=map("%r$_",(10..13));
+my @ni=map("%r$_",(14..15));
+$code.=<<___;
+ xor %rbx,%rbx
+ sub %r15,%rsi # compare top-most words
+ adc %rbx,%rbx
+ mov %rcx,%r10 # -$num
+ .byte 0x67
+ or %rbx,%rax
+ .byte 0x67
+ mov %rcx,%r9 # -$num
+ xor \$1,%rax
+ sar \$3+2,%rcx # cf=0
+ #lea 48+8(%rsp,%r9),$tptr
+ lea ($nptr,%rax,8),$nptr
+ movq %xmm1,$rptr # restore $rptr
+ movq %xmm1,$aptr # prepare for back-to-back call
+ jmp .Lsqrx4x_sub
+
+.align 32
+.Lsqrx4x_sub:
+ .byte 0x66
+ mov 8*0($tptr),%r12
+ mov 8*1($tptr),%r13
+ sbb 16*0($nptr),%r12
+ mov 8*2($tptr),%r14
+ sbb 16*1($nptr),%r13
+ mov 8*3($tptr),%r15
+ lea 8*4($tptr),$tptr
+ sbb 16*2($nptr),%r14
+ mov %r12,8*0($rptr)
+ sbb 16*3($nptr),%r15
+ lea 16*4($nptr),$nptr
+ mov %r13,8*1($rptr)
+ mov %r14,8*2($rptr)
+ mov %r15,8*3($rptr)
+ lea 8*4($rptr),$rptr
+
+ inc %rcx
+ jnz .Lsqrx4x_sub
+___
+}
+$code.=<<___;
+ neg %r9 # restore $num
+
+ ret
+.size bn_sqrx8x_internal,.-bn_sqrx8x_internal
+___
+}}}
+{
+my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") : # Win64 order
+ ("%rdi","%esi","%rdx","%ecx"); # Unix order
+my $out=$inp;
+my $STRIDE=2**5*8;
+my $N=$STRIDE/4;
+
+$code.=<<___;
+.globl bn_scatter5
+.type bn_scatter5,\@abi-omnipotent
+.align 16
+bn_scatter5:
+ cmp \$0, $num
+ jz .Lscatter_epilogue
+ lea ($tbl,$idx,8),$tbl
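+	# i.e. for (i = 0; i < num; i++) tbl[32*i + idx] = inp[i];
+	# powers are interleaved with stride 32 words, so a fixed idx
+	# lands at the same offset within every 32-word row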
+.Lscatter:
+ mov ($inp),%rax
+ lea 8($inp),$inp
+ mov %rax,($tbl)
+ lea 32*8($tbl),$tbl
+ sub \$1,$num
+ jnz .Lscatter
+.Lscatter_epilogue:
+ ret
+.size bn_scatter5,.-bn_scatter5
+
+.globl bn_gather5
+.type bn_gather5,\@abi-omnipotent
+.align 16
+bn_gather5:
+___
+$code.=<<___ if ($win64);
+.LSEH_begin_bn_gather5:
+ # I can't trust assembler to use specific encoding:-(
+ .byte 0x48,0x83,0xec,0x28 #sub \$0x28,%rsp
+ .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
+	.byte	0x0f,0x29,0x7c,0x24,0x10	#movaps %xmm7,0x10(%rsp)
+___
+$code.=<<___;
+ mov $idx,%r11d
+ shr \$`log($N/8)/log(2)`,$idx
+ and \$`$N/8-1`,%r11
+ not $idx
+ lea .Lmagic_masks(%rip),%rax
+ and \$`2**5/($N/8)-1`,$idx # 5 is "window size"
+ lea 128($tbl,%r11,8),$tbl # pointer within 1st cache line
+ movq 0(%rax,$idx,8),%xmm4 # set of masks denoting which
+ movq 8(%rax,$idx,8),%xmm5 # cache line contains element
+	movq	16(%rax,$idx,8),%xmm6		# denoted by 4th argument
+ movq 24(%rax,$idx,8),%xmm7
+ jmp .Lgather
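+	# each iteration reads one quadword from all four quarters of the
+	# 32-word row and masks them with %xmm4-7, exactly one of which
+	# is all-ones, so the cache lines touched do not depend on $idx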
+.align 16
+.Lgather:
+ movq `0*$STRIDE/4-128`($tbl),%xmm0
+ movq `1*$STRIDE/4-128`($tbl),%xmm1
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-128`($tbl),%xmm2
+ pand %xmm5,%xmm1
+ movq `3*$STRIDE/4-128`($tbl),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+ .byte 0x67,0x67
+ por %xmm2,%xmm0
+ lea $STRIDE($tbl),$tbl
+ por %xmm3,%xmm0
+
+ movq %xmm0,($out) # m0=bp[0]
+ lea 8($out),$out
+ sub \$1,$num
+ jnz .Lgather
+___
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ movaps 0x10(%rsp),%xmm7
+ lea 0x28(%rsp),%rsp
+___
+$code.=<<___;
+ ret
+.LSEH_end_bn_gather5:
+.size bn_gather5,.-bn_gather5
+___
+}
+$code.=<<___;
+.align 64
+.Lmagic_masks:
+ .long 0,0, 0,0, 0,0, -1,-1
+ .long 0,0, 0,0, 0,0, 0,0
+.asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
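+#
+# When an exception unwinds through one of the routines above, the
+# handler below recovers the saved non-volatile registers from the
+# function's frame and reports them back through *context before
+# handing off to RtlVirtualUnwind.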
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type mul_handler,\@abi-omnipotent
+.align 16
+mul_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # end of prologue label
+ cmp %r10,%rbx # context->Rip<end of prologue label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ lea .Lmul_epilogue(%rip),%r10
+ cmp %r10,%rbx
+ jb .Lbody_40
+
+ mov 192($context),%r10 # pull $num
+ mov 8(%rax,%r10,8),%rax # pull saved stack pointer
+ jmp .Lbody_proceed
+
+.Lbody_40:
+ mov 40(%rax),%rax # pull saved stack pointer
+.Lbody_proceed:
+
+ movaps -88(%rax),%xmm0
+ movaps -72(%rax),%xmm1
+
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
+ mov -48(%rax),%r15
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
+ movups %xmm0,512($context) # restore context->Xmm6
+ movups %xmm1,528($context) # restore context->Xmm7
+
+.Lcommon_seh_tail:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+	mov	\$154,%ecx		# sizeof(CONTEXT) in quadwords
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size mul_handler,.-mul_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_bn_mul_mont_gather5
+ .rva .LSEH_end_bn_mul_mont_gather5
+ .rva .LSEH_info_bn_mul_mont_gather5
+
+ .rva .LSEH_begin_bn_mul4x_mont_gather5
+ .rva .LSEH_end_bn_mul4x_mont_gather5
+ .rva .LSEH_info_bn_mul4x_mont_gather5
+
+ .rva .LSEH_begin_bn_power5
+ .rva .LSEH_end_bn_power5
+ .rva .LSEH_info_bn_power5
+
+ .rva .LSEH_begin_bn_from_mont8x
+ .rva .LSEH_end_bn_from_mont8x
+ .rva .LSEH_info_bn_from_mont8x
+___
+$code.=<<___ if ($addx);
+ .rva .LSEH_begin_bn_mulx4x_mont_gather5
+ .rva .LSEH_end_bn_mulx4x_mont_gather5
+ .rva .LSEH_info_bn_mulx4x_mont_gather5
+
+ .rva .LSEH_begin_bn_powerx5
+ .rva .LSEH_end_bn_powerx5
+ .rva .LSEH_info_bn_powerx5
+___
+$code.=<<___;
+ .rva .LSEH_begin_bn_gather5
+ .rva .LSEH_end_bn_gather5
+ .rva .LSEH_info_bn_gather5
+
+.section .xdata
+.align 8
+.LSEH_info_bn_mul_mont_gather5:
+ .byte 9,0,0,0
+ .rva mul_handler
+ .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
+.align 8
+.LSEH_info_bn_mul4x_mont_gather5:
+ .byte 9,0,0,0
+ .rva mul_handler
+ .rva .Lmul4x_body,.Lmul4x_epilogue # HandlerData[]
+.align 8
+.LSEH_info_bn_power5:
+ .byte 9,0,0,0
+ .rva mul_handler
+ .rva .Lpower5_body,.Lpower5_epilogue # HandlerData[]
+.align 8
+.LSEH_info_bn_from_mont8x:
+ .byte 9,0,0,0
+ .rva mul_handler
+ .rva .Lfrom_body,.Lfrom_epilogue # HandlerData[]
+___
+$code.=<<___ if ($addx);
+.align 8
+.LSEH_info_bn_mulx4x_mont_gather5:
+ .byte 9,0,0,0
+ .rva mul_handler
+ .rva .Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[]
+.align 8
+.LSEH_info_bn_powerx5:
+ .byte 9,0,0,0
+ .rva mul_handler
+ .rva .Lpowerx5_body,.Lpowerx5_epilogue # HandlerData[]
+___
+$code.=<<___;
+.align 8
+.LSEH_info_bn_gather5:
+ .byte 0x01,0x0d,0x05,0x00
+ .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
+ .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6
+ .byte 0x04,0x42,0x00,0x00 #sub rsp,0x28
+.align 8
+___
+}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+close STDOUT;
diff --git a/src/crypto/bn/bn.c b/src/crypto/bn/bn.c
new file mode 100644
index 0000000..368c4f1
--- /dev/null
+++ b/src/crypto/bn/bn.c
@@ -0,0 +1,338 @@
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to. The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code. The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * "This product includes cryptographic software written by
+ * Eric Young (eay@cryptsoft.com)"
+ * The word 'cryptographic' can be left out if the rouines from the library
+ * being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ * the apps directory (application code) you must include an acknowledgement:
+ * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed. i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.] */
+
+#include <openssl/bn.h>
+
+#include <limits.h>
+#include <string.h>
+
+#include <openssl/err.h>
+#include <openssl/mem.h>
+
+#include "internal.h"
+
+
+BIGNUM *BN_new(void) {
+ BIGNUM *bn = OPENSSL_malloc(sizeof(BIGNUM));
+
+ if (bn == NULL) {
+ OPENSSL_PUT_ERROR(BN, BN_new, ERR_R_MALLOC_FAILURE);
+ return NULL;
+ }
+
+ memset(bn, 0, sizeof(BIGNUM));
+ bn->flags = BN_FLG_MALLOCED;
+
+ return bn;
+}
+
+void BN_init(BIGNUM *bn) {
+ memset(bn, 0, sizeof(BIGNUM));
+}
+
+void BN_free(BIGNUM *bn) {
+ if (bn == NULL) {
+ return;
+ }
+
+ if (bn->d != NULL && (bn->flags & BN_FLG_STATIC_DATA) == 0) {
+ OPENSSL_free(bn->d);
+ }
+
+ if (bn->flags & BN_FLG_MALLOCED) {
+ OPENSSL_free(bn);
+ } else {
+ bn->d = NULL;
+ }
+}
+
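+/* BN_clear_free wipes |bn|'s contents before releasing them. A minimal
+ * sketch of the intended use, preferable to BN_free whenever the value held
+ * secret material such as a private exponent:
+ *
+ *   BIGNUM *priv = BN_new();
+ *   ...
+ *   BN_clear_free(priv);
+ */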
+void BN_clear_free(BIGNUM *bn) {
+ char should_free;
+
+ if (bn == NULL) {
+ return;
+ }
+
+ if (bn->d != NULL) {
+ OPENSSL_cleanse(bn->d, bn->dmax * sizeof(bn->d[0]));
+ if ((bn->flags & BN_FLG_STATIC_DATA) == 0) {
+ OPENSSL_free(bn->d);
+ }
+ }
+
+ should_free = (bn->flags & BN_FLG_MALLOCED) != 0;
+ OPENSSL_cleanse(bn, sizeof(BIGNUM));
+ if (should_free) {
+ OPENSSL_free(bn);
+ }
+}
+
+BIGNUM *BN_dup(const BIGNUM *src) {
+ BIGNUM *copy;
+
+ if (src == NULL) {
+ return NULL;
+ }
+
+ copy = BN_new();
+ if (copy == NULL) {
+ return NULL;
+ }
+
+ if (!BN_copy(copy, src)) {
+ BN_free(copy);
+ return NULL;
+ }
+
+ return copy;
+}
+
+BIGNUM *BN_copy(BIGNUM *dest, const BIGNUM *src) {
+ if (src == dest) {
+ return dest;
+ }
+
+ if (bn_wexpand(dest, src->top) == NULL) {
+ return NULL;
+ }
+
+ memcpy(dest->d, src->d, sizeof(src->d[0]) * src->top);
+
+ dest->top = src->top;
+ dest->neg = src->neg;
+ return dest;
+}
+
+void BN_clear(BIGNUM *bn) {
+ if (bn->d != NULL) {
+ memset(bn->d, 0, bn->dmax * sizeof(bn->d[0]));
+ }
+
+ bn->top = 0;
+ bn->neg = 0;
+}
+
+const BIGNUM *BN_value_one(void) {
+ static const BN_ULONG data_one = 1;
+ static const BIGNUM const_one = {(BN_ULONG *)&data_one, 1, 1, 0,
+ BN_FLG_STATIC_DATA};
+
+ return &const_one;
+}
+
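+/* BN_with_flags initialises |out| as a shallow alias of |in|: it shares
+ * |in|'s word array and carries |flags| in addition to BN_FLG_STATIC_DATA.
+ * |out| owns neither itself nor the words, so it must not be passed to
+ * BN_free and must not outlive |in|. */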
+void BN_with_flags(BIGNUM *out, const BIGNUM *in, int flags) {
+ memcpy(out, in, sizeof(BIGNUM));
+ out->flags &= ~BN_FLG_MALLOCED;
+ out->flags |= BN_FLG_STATIC_DATA | flags;
+}
+
+/* BN_num_bits_word returns the minimum number of bits needed to represent the
+ * value in |l|. */
+unsigned BN_num_bits_word(BN_ULONG l) {
+ static const unsigned char bits[256] = {
+ 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
+
+#if defined(OPENSSL_64_BIT)
+  if (l & 0xffffffff00000000L) {
+    if (l & 0xffff000000000000L) {
+      if (l & 0xff00000000000000L) {
+        return (bits[(int)(l >> 56)] + 56);
+      } else {
+        return (bits[(int)(l >> 48)] + 48);
+      }
+    } else {
+      if (l & 0x0000ff0000000000L) {
+        return (bits[(int)(l >> 40)] + 40);
+      } else {
+        return (bits[(int)(l >> 32)] + 32);
+      }
+    }
+  } else
+#endif
+ {
+ if (l & 0xffff0000L) {
+ if (l & 0xff000000L) {
+ return (bits[(int)(l >> 24L)] + 24);
+ } else {
+ return (bits[(int)(l >> 16L)] + 16);
+ }
+ } else {
+ if (l & 0xff00L) {
+ return (bits[(int)(l >> 8)] + 8);
+ } else {
+ return (bits[(int)(l)]);
+ }
+ }
+ }
+}
+
+unsigned BN_num_bits(const BIGNUM *bn) {
+ const int max = bn->top - 1;
+
+ if (BN_is_zero(bn)) {
+ return 0;
+ }
+
+  return max * BN_BITS2 + BN_num_bits_word(bn->d[max]);
+}
+
+unsigned BN_num_bytes(const BIGNUM *bn) {
+ return (BN_num_bits(bn) + 7) / 8;
+}
+
+void BN_zero(BIGNUM *bn) {
+ bn->top = bn->neg = 0;
+}
+
+int BN_one(BIGNUM *bn) {
+ return BN_set_word(bn, 1);
+}
+
+int BN_set_word(BIGNUM *bn, BN_ULONG value) {
+ if (value == 0) {
+ BN_zero(bn);
+ return 1;
+ }
+
+ if (bn_wexpand(bn, 1) == NULL) {
+ return 0;
+ }
+
+ bn->neg = 0;
+ bn->d[0] = value;
+ bn->top = 1;
+ return 1;
+}
+
+int BN_is_negative(const BIGNUM *bn) {
+ return bn->neg != 0;
+}
+
+void BN_set_negative(BIGNUM *bn, int sign) {
+ if (sign && !BN_is_zero(bn)) {
+ bn->neg = 1;
+ } else {
+ bn->neg = 0;
+ }
+}
+
+BIGNUM *bn_wexpand(BIGNUM *bn, unsigned words) {
+ BN_ULONG *a;
+
+ if (words <= (unsigned) bn->dmax) {
+ return bn;
+ }
+
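+  /* Cap the size so that bit counts derived from |words| cannot overflow an
+   * int: the limit keeps |words| * BN_BITS2 at or below INT_MAX / 4. */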
+ if (words > (INT_MAX / (4 * BN_BITS2))) {
+ OPENSSL_PUT_ERROR(BN, bn_wexpand, BN_R_BIGNUM_TOO_LONG);
+ return NULL;
+ }
+
+ if (bn->flags & BN_FLG_STATIC_DATA) {
+ OPENSSL_PUT_ERROR(BN, bn_wexpand, BN_R_EXPAND_ON_STATIC_BIGNUM_DATA);
+ return NULL;
+ }
+
+ a = (BN_ULONG *)OPENSSL_malloc(sizeof(BN_ULONG) * words);
+ if (a == NULL) {
+ OPENSSL_PUT_ERROR(BN, bn_wexpand, ERR_R_MALLOC_FAILURE);
+ return NULL;
+ }
+
+ memcpy(a, bn->d, sizeof(BN_ULONG) * bn->top);
+
+ if (bn->d) {
+ OPENSSL_free(bn->d);
+ }
+ bn->d = a;
+ bn->dmax = words;
+
+ return bn;
+}
+
+BIGNUM *bn_expand(BIGNUM *bn, unsigned bits) {
+  return bn_wexpand(bn, (bits + BN_BITS2 - 1) / BN_BITS2);
+}
+
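+/* bn_correct_top strips leading zero words from |bn| by decrementing |top|
+ * until it indexes one past the most significant non-zero word (or reaches
+ * zero for a zero value). */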
+void bn_correct_top(BIGNUM *bn) {
+ BN_ULONG *ftl;
+ int tmp_top = bn->top;
+
+ if (tmp_top > 0) {
+ for (ftl = &(bn->d[tmp_top - 1]); tmp_top > 0; tmp_top--) {
+ if (*(ftl--)) {
+ break;
+ }
+ }
+ bn->top = tmp_top;
+ }
+}
+
+int BN_get_flags(const BIGNUM *bn, int flags) {
+ return bn->flags & flags;
+}
+
+void BN_set_flags(BIGNUM *bn, int flags) {
+ bn->flags |= flags;
+}
diff --git a/src/crypto/bn/bn_error.c b/src/crypto/bn/bn_error.c
new file mode 100644
index 0000000..b522c2a
--- /dev/null
+++ b/src/crypto/bn/bn_error.c
@@ -0,0 +1,63 @@
+/* Copyright (c) 2014, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#include <openssl/err.h>
+
+#include <openssl/bn.h>
+
+const ERR_STRING_DATA BN_error_string_data[] = {
+ {ERR_PACK(ERR_LIB_BN, BN_F_BN_CTX_get, 0), "BN_CTX_get"},
+ {ERR_PACK(ERR_LIB_BN, BN_F_BN_CTX_new, 0), "BN_CTX_new"},
+ {ERR_PACK(ERR_LIB_BN, BN_F_BN_CTX_start, 0), "BN_CTX_start"},
+ {ERR_PACK(ERR_LIB_BN, BN_F_BN_bn2dec, 0), "BN_bn2dec"},
+ {ERR_PACK(ERR_LIB_BN, BN_F_BN_bn2hex, 0), "BN_bn2hex"},
+ {ERR_PACK(ERR_LIB_BN, BN_F_BN_div, 0), "BN_div"},
+ {ERR_PACK(ERR_LIB_BN, BN_F_BN_div_recp, 0), "BN_div_recp"},
+ {ERR_PACK(ERR_LIB_BN, BN_F_BN_exp, 0), "BN_exp"},
+ {ERR_PACK(ERR_LIB_BN, BN_F_BN_generate_dsa_nonce, 0), "BN_generate_dsa_nonce"},
+ {ERR_PACK(ERR_LIB_BN, BN_F_BN_generate_prime_ex, 0), "BN_generate_prime_ex"},
+ {ERR_PACK(ERR_LIB_BN, BN_F_BN_mod_exp2_mont, 0), "BN_mod_exp2_mont"},
+ {ERR_PACK(ERR_LIB_BN, BN_F_BN_mod_exp_mont, 0), "BN_mod_exp_mont"},
+ {ERR_PACK(ERR_LIB_BN, BN_F_BN_mod_exp_mont_consttime, 0), "BN_mod_exp_mont_consttime"},
+ {ERR_PACK(ERR_LIB_BN, BN_F_BN_mod_exp_mont_word, 0), "BN_mod_exp_mont_word"},
+ {ERR_PACK(ERR_LIB_BN, BN_F_BN_mod_inverse, 0), "BN_mod_inverse"},
+ {ERR_PACK(ERR_LIB_BN, BN_F_BN_mod_inverse_no_branch, 0), "BN_mod_inverse_no_branch"},
+ {ERR_PACK(ERR_LIB_BN, BN_F_BN_mod_lshift_quick, 0), "BN_mod_lshift_quick"},
+ {ERR_PACK(ERR_LIB_BN, BN_F_BN_mod_sqrt, 0), "BN_mod_sqrt"},
+ {ERR_PACK(ERR_LIB_BN, BN_F_BN_new, 0), "BN_new"},
+ {ERR_PACK(ERR_LIB_BN, BN_F_BN_rand, 0), "BN_rand"},
+ {ERR_PACK(ERR_LIB_BN, BN_F_BN_rand_range, 0), "BN_rand_range"},
+ {ERR_PACK(ERR_LIB_BN, BN_F_BN_sqrt, 0), "BN_sqrt"},
+ {ERR_PACK(ERR_LIB_BN, BN_F_BN_usub, 0), "BN_usub"},
+ {ERR_PACK(ERR_LIB_BN, BN_F_bn_wexpand, 0), "bn_wexpand"},
+ {ERR_PACK(ERR_LIB_BN, BN_F_mod_exp_recp, 0), "mod_exp_recp"},
+ {ERR_PACK(ERR_LIB_BN, 0, BN_R_ARG2_LT_ARG3), "ARG2_LT_ARG3"},
+ {ERR_PACK(ERR_LIB_BN, 0, BN_R_BAD_RECIPROCAL), "BAD_RECIPROCAL"},
+ {ERR_PACK(ERR_LIB_BN, 0, BN_R_BIGNUM_TOO_LONG), "BIGNUM_TOO_LONG"},
+ {ERR_PACK(ERR_LIB_BN, 0, BN_R_BITS_TOO_SMALL), "BITS_TOO_SMALL"},
+ {ERR_PACK(ERR_LIB_BN, 0, BN_R_CALLED_WITH_EVEN_MODULUS), "CALLED_WITH_EVEN_MODULUS"},
+ {ERR_PACK(ERR_LIB_BN, 0, BN_R_DIV_BY_ZERO), "DIV_BY_ZERO"},
+ {ERR_PACK(ERR_LIB_BN, 0, BN_R_EXPAND_ON_STATIC_BIGNUM_DATA), "EXPAND_ON_STATIC_BIGNUM_DATA"},
+ {ERR_PACK(ERR_LIB_BN, 0, BN_R_INPUT_NOT_REDUCED), "INPUT_NOT_REDUCED"},
+ {ERR_PACK(ERR_LIB_BN, 0, BN_R_INVALID_RANGE), "INVALID_RANGE"},
+ {ERR_PACK(ERR_LIB_BN, 0, BN_R_NEGATIVE_NUMBER), "NEGATIVE_NUMBER"},
+ {ERR_PACK(ERR_LIB_BN, 0, BN_R_NOT_A_SQUARE), "NOT_A_SQUARE"},
+ {ERR_PACK(ERR_LIB_BN, 0, BN_R_NOT_INITIALIZED), "NOT_INITIALIZED"},
+ {ERR_PACK(ERR_LIB_BN, 0, BN_R_NO_INVERSE), "NO_INVERSE"},
+ {ERR_PACK(ERR_LIB_BN, 0, BN_R_PRIVATE_KEY_TOO_LARGE), "PRIVATE_KEY_TOO_LARGE"},
+ {ERR_PACK(ERR_LIB_BN, 0, BN_R_P_IS_NOT_PRIME), "P_IS_NOT_PRIME"},
+ {ERR_PACK(ERR_LIB_BN, 0, BN_R_TOO_MANY_ITERATIONS), "TOO_MANY_ITERATIONS"},
+ {ERR_PACK(ERR_LIB_BN, 0, BN_R_TOO_MANY_TEMPORARY_VARIABLES), "TOO_MANY_TEMPORARY_VARIABLES"},
+ {0, NULL},
+};
diff --git a/src/crypto/bn/bn_test.c b/src/crypto/bn/bn_test.c
new file mode 100644
index 0000000..e342ed8
--- /dev/null
+++ b/src/crypto/bn/bn_test.c
@@ -0,0 +1,1471 @@
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to. The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code. The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * "This product includes cryptographic software written by
+ * Eric Young (eay@cryptsoft.com)"
+ * The word 'cryptographic' can be left out if the rouines from the library
+ * being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ * the apps directory (application code) you must include an acknowledgement:
+ * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed. i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+/* ====================================================================
+ * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED.
+ *
+ * Portions of the attached software ("Contribution") are developed by
+ * SUN MICROSYSTEMS, INC., and are contributed to the OpenSSL project.
+ *
+ * The Contribution is licensed pursuant to the Eric Young open source
+ * license provided above.
+ *
+ * The binary polynomial arithmetic software is originally written by
+ * Sheueling Chang Shantz and Douglas Stebila of Sun Microsystems
+ * Laboratories. */
+
+#include <stdio.h>
+#include <string.h>
+
+#include <openssl/bio.h>
+#include <openssl/bn.h>
+#include <openssl/crypto.h>
+#include <openssl/err.h>
+#include <openssl/mem.h>
+
+#include "internal.h"
+
+
+static const int num0 = 100; /* number of tests */
+static const int num1 = 50; /* additional tests for some functions */
+static const int num2 = 5; /* number of tests for slow functions */
+
+int test_add(BIO *bp);
+int test_sub(BIO *bp);
+int test_lshift1(BIO *bp);
+int test_lshift(BIO *bp, BN_CTX *ctx, BIGNUM *a_);
+int test_rshift1(BIO *bp);
+int test_rshift(BIO *bp, BN_CTX *ctx);
+int test_sqr(BIO *bp, BN_CTX *ctx);
+int test_mul(BIO *bp);
+int test_div(BIO *bp, BN_CTX *ctx);
+int rand_neg(void);
+
+int test_div_word(BIO *bp);
+int test_mont(BIO *bp, BN_CTX *ctx);
+int test_mod(BIO *bp, BN_CTX *ctx);
+int test_mod_mul(BIO *bp, BN_CTX *ctx);
+int test_mod_exp(BIO *bp, BN_CTX *ctx);
+int test_mod_exp_mont_consttime(BIO *bp, BN_CTX *ctx);
+int test_exp(BIO *bp, BN_CTX *ctx);
+int test_mod_sqrt(BIO *bp, BN_CTX *ctx);
+static int test_exp_mod_zero(void);
+int test_small_prime(BIO *bp,BN_CTX *ctx);
+int test_mod_exp_mont5(BIO *bp, BN_CTX *ctx);
+int test_sqrt(BIO *bp, BN_CTX *ctx);
+int test_bn2bin_padded(BIO *bp, BN_CTX *ctx);
+#if 0
+int test_gf2m_add(BIO *bp);
+int test_gf2m_mod(BIO *bp);
+int test_gf2m_mod_mul(BIO *bp, BN_CTX *ctx);
+int test_gf2m_mod_sqr(BIO *bp, BN_CTX *ctx);
+int test_gf2m_mod_inv(BIO *bp, BN_CTX *ctx);
+int test_gf2m_mod_div(BIO *bp, BN_CTX *ctx);
+int test_gf2m_mod_exp(BIO *bp, BN_CTX *ctx);
+int test_gf2m_mod_sqrt(BIO *bp, BN_CTX *ctx);
+int test_gf2m_mod_solve_quad(BIO *bp, BN_CTX *ctx);
+#endif
+static int results = 0;
+
+static unsigned char lst[] =
+ "\xC6\x4F\x43\x04\x2A\xEA\xCA\x6E\x58\x36\x80\x5B\xE8\xC9"
+ "\x9B\x04\x5D\x48\x36\xC2\xFD\x16\xC9\x64\xF0";
+
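+/* ERR_print_errors_fp is stubbed out to a no-op here; this test reports
+ * failures through fprintf rather than the error queue. */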
+static void ERR_print_errors_fp(FILE *out) {
+}
+
+static void message(BIO *out, char *m) {
+ BIO_puts(out, "print \"test ");
+ BIO_puts(out, m);
+ BIO_puts(out, "\\n\"\n");
+}
+
+int main(int argc, char *argv[]) {
+ BN_CTX *ctx;
+ BIO *out = NULL;
+ char *outfile = NULL;
+
+ CRYPTO_library_init();
+
+ results = 0;
+
+ argc--;
+ argv++;
+ while (argc >= 1) {
+ if (strcmp(*argv, "-results") == 0)
+ results = 1;
+ else if (strcmp(*argv, "-out") == 0) {
+ if (--argc < 1)
+ break;
+ outfile = *(++argv);
+ }
+ argc--;
+ argv++;
+ }
+
+
+ ctx = BN_CTX_new();
+ if (ctx == NULL)
+ return 1;
+
+ out = BIO_new(BIO_s_file());
+ if (out == NULL) {
+ return 1;
+ }
+
+ if (outfile == NULL) {
+ BIO_set_fp(out, stdout, BIO_NOCLOSE);
+ } else {
+ if (!BIO_write_filename(out, outfile)) {
+ perror(outfile);
+ return 1;
+ }
+ }
+
+ if (!results)
+ BIO_puts(out, "obase=16\nibase=16\n");
+
+ message(out, "BN_add");
+ if (!test_add(out))
+ goto err;
+ (void)BIO_flush(out);
+
+ message(out, "BN_sub");
+ if (!test_sub(out))
+ goto err;
+ (void)BIO_flush(out);
+
+ message(out, "BN_lshift1");
+ if (!test_lshift1(out))
+ goto err;
+ (void)BIO_flush(out);
+
+ message(out, "BN_lshift (fixed)");
+ if (!test_lshift(out, ctx, BN_bin2bn(lst, sizeof(lst) - 1, NULL)))
+ goto err;
+ (void)BIO_flush(out);
+
+ message(out, "BN_lshift");
+ if (!test_lshift(out, ctx, NULL))
+ goto err;
+ (void)BIO_flush(out);
+
+ message(out, "BN_rshift1");
+ if (!test_rshift1(out))
+ goto err;
+ (void)BIO_flush(out);
+
+ message(out, "BN_rshift");
+ if (!test_rshift(out, ctx))
+ goto err;
+ (void)BIO_flush(out);
+
+ message(out, "BN_sqr");
+ if (!test_sqr(out, ctx))
+ goto err;
+ (void)BIO_flush(out);
+
+ message(out, "BN_mul");
+ if (!test_mul(out))
+ goto err;
+ (void)BIO_flush(out);
+
+ message(out, "BN_div");
+ if (!test_div(out, ctx))
+ goto err;
+ (void)BIO_flush(out);
+
+ message(out, "BN_div_word");
+ if (!test_div_word(out))
+ goto err;
+ (void)BIO_flush(out);
+
+ message(out, "BN_mod");
+ if (!test_mod(out, ctx))
+ goto err;
+ (void)BIO_flush(out);
+
+ message(out, "BN_mod_mul");
+ if (!test_mod_mul(out, ctx))
+ goto err;
+ (void)BIO_flush(out);
+
+ message(out, "BN_mont");
+ if (!test_mont(out, ctx))
+ goto err;
+ (void)BIO_flush(out);
+
+ message(out, "BN_mod_exp");
+ if (!test_mod_exp(out, ctx))
+ goto err;
+ (void)BIO_flush(out);
+
+ message(out, "BN_mod_exp_mont_consttime");
+ if (!test_mod_exp_mont_consttime(out, ctx) ||
+ !test_mod_exp_mont5(out, ctx)) {
+ goto err;
+ }
+ (void)BIO_flush(out);
+
+ message(out, "BN_exp");
+ if (!test_exp(out, ctx) ||
+ !test_exp_mod_zero()) {
+ goto err;
+ }
+ (void)BIO_flush(out);
+
+ message(out, "BN_mod_sqrt");
+ if (!test_mod_sqrt(out, ctx))
+ goto err;
+ (void)BIO_flush(out);
+
+ message(out, "Small prime generation");
+ if (!test_small_prime(out, ctx))
+ goto err;
+ (void)BIO_flush(out);
+
+ message(out, "BN_sqrt");
+ if (!test_sqrt(out, ctx))
+ goto err;
+ (void)BIO_flush(out);
+
+ message(out, "BN_bn2bin_padded");
+ if (!test_bn2bin_padded(out, ctx))
+ goto err;
+ (void)BIO_flush(out);
+
+ BN_CTX_free(ctx);
+ BIO_free(out);
+
+ printf("PASS\n");
+ return 0;
+
+err:
+  BIO_puts(out, "1\n"); /* make sure the Perl script fed by bc notices
+                         * the failure, see test_bn in test/Makefile.ssl */
+ (void)BIO_flush(out);
+
+ return 1;
+}
+
+int test_add(BIO *bp) {
+ BIGNUM a, b, c;
+ int i;
+
+ BN_init(&a);
+ BN_init(&b);
+ BN_init(&c);
+
+ BN_rand(&a, 512, 0, 0);
+ for (i = 0; i < num0; i++) {
+ BN_rand(&b, 450 + i, 0, 0);
+ a.neg = rand_neg();
+ b.neg = rand_neg();
+ BN_add(&c, &a, &b);
+ if (bp != NULL) {
+ if (!results) {
+ BN_print(bp, &a);
+ BIO_puts(bp, " + ");
+ BN_print(bp, &b);
+ BIO_puts(bp, " - ");
+ }
+ BN_print(bp, &c);
+ BIO_puts(bp, "\n");
+ }
+ a.neg = !a.neg;
+ b.neg = !b.neg;
+ BN_add(&c, &c, &b);
+ BN_add(&c, &c, &a);
+ if (!BN_is_zero(&c)) {
+ fprintf(stderr, "Add test failed!\n");
+ return 0;
+ }
+ }
+ BN_free(&a);
+ BN_free(&b);
+ BN_free(&c);
+ return (1);
+}
+
+int test_sub(BIO *bp) {
+ BIGNUM a, b, c;
+ int i;
+
+ BN_init(&a);
+ BN_init(&b);
+ BN_init(&c);
+
+ for (i = 0; i < num0 + num1; i++) {
+ if (i < num1) {
+ BN_rand(&a, 512, 0, 0);
+ BN_copy(&b, &a);
+ if (BN_set_bit(&a, i) == 0)
+ return (0);
+ BN_add_word(&b, i);
+ } else {
+ BN_rand(&b, 400 + i - num1, 0, 0);
+ a.neg = rand_neg();
+ b.neg = rand_neg();
+ }
+ BN_sub(&c, &a, &b);
+ if (bp != NULL) {
+ if (!results) {
+ BN_print(bp, &a);
+ BIO_puts(bp, " - ");
+ BN_print(bp, &b);
+ BIO_puts(bp, " - ");
+ }
+ BN_print(bp, &c);
+ BIO_puts(bp, "\n");
+ }
+ BN_add(&c, &c, &b);
+ BN_sub(&c, &c, &a);
+ if (!BN_is_zero(&c)) {
+ fprintf(stderr, "Subtract test failed!\n");
+ return 0;
+ }
+ }
+ BN_free(&a);
+ BN_free(&b);
+ BN_free(&c);
+ return (1);
+}
+
+int test_div(BIO *bp, BN_CTX *ctx) {
+ BIGNUM a, b, c, d, e;
+ int i;
+
+ BN_init(&a);
+ BN_init(&b);
+ BN_init(&c);
+ BN_init(&d);
+ BN_init(&e);
+
+ for (i = 0; i < num0 + num1; i++) {
+ if (i < num1) {
+ BN_rand(&a, 400, 0, 0);
+ BN_copy(&b, &a);
+ BN_lshift(&a, &a, i);
+ BN_add_word(&a, i);
+ } else
+ BN_rand(&b, 50 + 3 * (i - num1), 0, 0);
+ a.neg = rand_neg();
+ b.neg = rand_neg();
+ BN_div(&d, &c, &a, &b, ctx);
+ if (bp != NULL) {
+ if (!results) {
+ BN_print(bp, &a);
+ BIO_puts(bp, " / ");
+ BN_print(bp, &b);
+ BIO_puts(bp, " - ");
+ }
+ BN_print(bp, &d);
+ BIO_puts(bp, "\n");
+
+ if (!results) {
+ BN_print(bp, &a);
+ BIO_puts(bp, " % ");
+ BN_print(bp, &b);
+ BIO_puts(bp, " - ");
+ }
+ BN_print(bp, &c);
+ BIO_puts(bp, "\n");
+ }
+ BN_mul(&e, &d, &b, ctx);
+ BN_add(&d, &e, &c);
+ BN_sub(&d, &d, &a);
+ if (!BN_is_zero(&d)) {
+ fprintf(stderr, "Division test failed!\n");
+ return 0;
+ }
+ }
+ BN_free(&a);
+ BN_free(&b);
+ BN_free(&c);
+ BN_free(&d);
+ BN_free(&e);
+ return (1);
+}
+
+int test_lshift1(BIO *bp) {
+ BIGNUM *a, *b, *c;
+ int i;
+
+ a = BN_new();
+ b = BN_new();
+ c = BN_new();
+
+ BN_rand(a, 200, 0, 0); /**/
+ a->neg = rand_neg();
+ for (i = 0; i < num0; i++) {
+ BN_lshift1(b, a);
+ if (bp != NULL) {
+ if (!results) {
+ BN_print(bp, a);
+ BIO_puts(bp, " * 2");
+ BIO_puts(bp, " - ");
+ }
+ BN_print(bp, b);
+ BIO_puts(bp, "\n");
+ }
+ BN_add(c, a, a);
+ BN_sub(a, b, c);
+ if (!BN_is_zero(a)) {
+ fprintf(stderr, "Left shift one test failed!\n");
+ return 0;
+ }
+
+ BN_copy(a, b);
+ }
+ BN_free(a);
+ BN_free(b);
+ BN_free(c);
+ return (1);
+}
+
+int test_rshift(BIO *bp, BN_CTX *ctx) {
+ BIGNUM *a, *b, *c, *d, *e;
+ int i;
+
+ a = BN_new();
+ b = BN_new();
+ c = BN_new();
+ d = BN_new();
+ e = BN_new();
+ BN_one(c);
+
+ BN_rand(a, 200, 0, 0); /**/
+ a->neg = rand_neg();
+ for (i = 0; i < num0; i++) {
+ BN_rshift(b, a, i + 1);
+ BN_add(c, c, c);
+ if (bp != NULL) {
+ if (!results) {
+ BN_print(bp, a);
+ BIO_puts(bp, " / ");
+ BN_print(bp, c);
+ BIO_puts(bp, " - ");
+ }
+ BN_print(bp, b);
+ BIO_puts(bp, "\n");
+ }
+ BN_div(d, e, a, c, ctx);
+ BN_sub(d, d, b);
+ if (!BN_is_zero(d)) {
+ fprintf(stderr, "Right shift test failed!\n");
+ return 0;
+ }
+ }
+ BN_free(a);
+ BN_free(b);
+ BN_free(c);
+ BN_free(d);
+ BN_free(e);
+ return (1);
+}
+
+int test_rshift1(BIO *bp) {
+ BIGNUM *a, *b, *c;
+ int i;
+
+ a = BN_new();
+ b = BN_new();
+ c = BN_new();
+
+ BN_rand(a, 200, 0, 0); /**/
+ a->neg = rand_neg();
+ for (i = 0; i < num0; i++) {
+ BN_rshift1(b, a);
+ if (bp != NULL) {
+ if (!results) {
+ BN_print(bp, a);
+ BIO_puts(bp, " / 2");
+ BIO_puts(bp, " - ");
+ }
+ BN_print(bp, b);
+ BIO_puts(bp, "\n");
+ }
+ BN_sub(c, a, b);
+ BN_sub(c, c, b);
+ if (!BN_is_zero(c) && !BN_abs_is_word(c, 1)) {
+ fprintf(stderr, "Right shift one test failed!\n");
+ return 0;
+ }
+ BN_copy(a, b);
+ }
+ BN_free(a);
+ BN_free(b);
+ BN_free(c);
+ return (1);
+}
+
+int test_lshift(BIO *bp, BN_CTX *ctx, BIGNUM *a_) {
+ BIGNUM *a, *b, *c, *d;
+ int i;
+
+ b = BN_new();
+ c = BN_new();
+ d = BN_new();
+ BN_one(c);
+
+ if (a_)
+ a = a_;
+ else {
+ a = BN_new();
+ BN_rand(a, 200, 0, 0); /**/
+ a->neg = rand_neg();
+ }
+ for (i = 0; i < num0; i++) {
+ BN_lshift(b, a, i + 1);
+ BN_add(c, c, c);
+ if (bp != NULL) {
+ if (!results) {
+ BN_print(bp, a);
+ BIO_puts(bp, " * ");
+ BN_print(bp, c);
+ BIO_puts(bp, " - ");
+ }
+ BN_print(bp, b);
+ BIO_puts(bp, "\n");
+ }
+ BN_mul(d, a, c, ctx);
+ BN_sub(d, d, b);
+ if (!BN_is_zero(d)) {
+ fprintf(stderr, "Left shift test failed!\n");
+ fprintf(stderr, "a=");
+ BN_print_fp(stderr, a);
+ fprintf(stderr, "\nb=");
+ BN_print_fp(stderr, b);
+ fprintf(stderr, "\nc=");
+ BN_print_fp(stderr, c);
+ fprintf(stderr, "\nd=");
+ BN_print_fp(stderr, d);
+ fprintf(stderr, "\n");
+ return 0;
+ }
+ }
+ BN_free(a);
+ BN_free(b);
+ BN_free(c);
+ BN_free(d);
+ return (1);
+}
+
+int test_mul(BIO *bp) {
+ BIGNUM a, b, c, d, e;
+ int i;
+ BN_CTX *ctx;
+
+ ctx = BN_CTX_new();
+ if (ctx == NULL)
+ abort();
+
+ BN_init(&a);
+ BN_init(&b);
+ BN_init(&c);
+ BN_init(&d);
+ BN_init(&e);
+
+ for (i = 0; i < num0 + num1; i++) {
+ if (i <= num1) {
+ BN_rand(&a, 100, 0, 0);
+ BN_rand(&b, 100, 0, 0);
+ } else
+ BN_rand(&b, i - num1, 0, 0);
+ a.neg = rand_neg();
+ b.neg = rand_neg();
+ BN_mul(&c, &a, &b, ctx);
+ if (bp != NULL) {
+ if (!results) {
+ BN_print(bp, &a);
+ BIO_puts(bp, " * ");
+ BN_print(bp, &b);
+ BIO_puts(bp, " - ");
+ }
+ BN_print(bp, &c);
+ BIO_puts(bp, "\n");
+ }
+ BN_div(&d, &e, &c, &a, ctx);
+ BN_sub(&d, &d, &b);
+ if (!BN_is_zero(&d) || !BN_is_zero(&e)) {
+ fprintf(stderr, "Multiplication test failed!\n");
+ return 0;
+ }
+ }
+ BN_free(&a);
+ BN_free(&b);
+ BN_free(&c);
+ BN_free(&d);
+ BN_free(&e);
+ BN_CTX_free(ctx);
+ return (1);
+}
+
+int test_sqr(BIO *bp, BN_CTX *ctx) {
+ BIGNUM *a, *c, *d, *e;
+ int i, ret = 0;
+
+ a = BN_new();
+ c = BN_new();
+ d = BN_new();
+ e = BN_new();
+ if (a == NULL || c == NULL || d == NULL || e == NULL) {
+ goto err;
+ }
+
+ for (i = 0; i < num0; i++) {
+ BN_rand(a, 40 + i * 10, 0, 0);
+ a->neg = rand_neg();
+ BN_sqr(c, a, ctx);
+ if (bp != NULL) {
+ if (!results) {
+ BN_print(bp, a);
+ BIO_puts(bp, " * ");
+ BN_print(bp, a);
+ BIO_puts(bp, " - ");
+ }
+ BN_print(bp, c);
+ BIO_puts(bp, "\n");
+ }
+ BN_div(d, e, c, a, ctx);
+ BN_sub(d, d, a);
+ if (!BN_is_zero(d) || !BN_is_zero(e)) {
+ fprintf(stderr, "Square test failed!\n");
+ goto err;
+ }
+ }
+
+ /* Regression test for a BN_sqr overflow bug. */
+ BN_hex2bn(&a,
+ "80000000000000008000000000000001FFFFFFFFFFFFFFFE0000000000000000");
+ BN_sqr(c, a, ctx);
+ if (bp != NULL) {
+ if (!results) {
+ BN_print(bp, a);
+ BIO_puts(bp, " * ");
+ BN_print(bp, a);
+ BIO_puts(bp, " - ");
+ }
+ BN_print(bp, c);
+ BIO_puts(bp, "\n");
+ }
+ BN_mul(d, a, a, ctx);
+ if (BN_cmp(c, d)) {
+ fprintf(stderr,
+ "Square test failed: BN_sqr and BN_mul produce "
+ "different results!\n");
+ goto err;
+ }
+
+ /* Regression test for a BN_sqr overflow bug. */
+ BN_hex2bn(&a,
+ "80000000000000000000000080000001FFFFFFFE000000000000000000000000");
+ BN_sqr(c, a, ctx);
+ if (bp != NULL) {
+ if (!results) {
+ BN_print(bp, a);
+ BIO_puts(bp, " * ");
+ BN_print(bp, a);
+ BIO_puts(bp, " - ");
+ }
+ BN_print(bp, c);
+ BIO_puts(bp, "\n");
+ }
+ BN_mul(d, a, a, ctx);
+ if (BN_cmp(c, d)) {
+ fprintf(stderr,
+ "Square test failed: BN_sqr and BN_mul produce "
+ "different results!\n");
+ goto err;
+ }
+ ret = 1;
+
+err:
+ if (a != NULL) {
+ BN_free(a);
+ }
+ if (c != NULL) {
+ BN_free(c);
+ }
+ if (d != NULL) {
+ BN_free(d);
+ }
+ if (e != NULL) {
+ BN_free(e);
+ }
+ return ret;
+}
+
+
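+/* rand_neg returns a sign from a fixed repeating eight-element pattern, so
+ * both signs are exercised deterministically across runs. */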
+int rand_neg(void) {
+ static unsigned int neg = 0;
+ static int sign[8] = {0, 0, 0, 1, 1, 0, 1, 1};
+
+ return (sign[(neg++) % 8]);
+}
+
+static void print_word(BIO *bp, BN_ULONG w) {
+ BIO_printf(bp, BN_HEX_FMT1, w);
+}
+
+int test_div_word(BIO *bp) {
+ BIGNUM a, b;
+ BN_ULONG r, s;
+ int i;
+
+ BN_init(&a);
+ BN_init(&b);
+
+ for (i = 0; i < num0; i++) {
+ do {
+ BN_rand(&a, 512, -1, 0);
+ BN_rand(&b, BN_BITS2, -1, 0);
+ s = b.d[0];
+ } while (!s);
+
+ BN_copy(&b, &a);
+ r = BN_div_word(&b, s);
+
+ if (bp != NULL) {
+ if (!results) {
+ BN_print(bp, &a);
+ BIO_puts(bp, " / ");
+ print_word(bp, s);
+ BIO_puts(bp, " - ");
+ }
+ BN_print(bp, &b);
+ BIO_puts(bp, "\n");
+
+ if (!results) {
+ BN_print(bp, &a);
+ BIO_puts(bp, " % ");
+ print_word(bp, s);
+ BIO_puts(bp, " - ");
+ }
+ print_word(bp, r);
+ BIO_puts(bp, "\n");
+ }
+ BN_mul_word(&b, s);
+ BN_add_word(&b, r);
+ BN_sub(&b, &a, &b);
+ if (!BN_is_zero(&b)) {
+ fprintf(stderr, "Division (word) test failed!\n");
+ return 0;
+ }
+ }
+ BN_free(&a);
+ BN_free(&b);
+ return (1);
+}
+
+int test_mont(BIO *bp, BN_CTX *ctx) {
+ BIGNUM a, b, c, d, A, B;
+ BIGNUM n;
+ int i;
+ BN_MONT_CTX *mont;
+
+ BN_init(&a);
+ BN_init(&b);
+ BN_init(&c);
+ BN_init(&d);
+ BN_init(&A);
+ BN_init(&B);
+ BN_init(&n);
+
+ mont = BN_MONT_CTX_new();
+ if (mont == NULL)
+ return 0;
+
+ BN_rand(&a, 100, 0, 0); /**/
+ BN_rand(&b, 100, 0, 0); /**/
+ for (i = 0; i < num2; i++) {
+ int bits = (200 * (i + 1)) / num2;
+
+ if (bits == 0)
+ continue;
+ BN_rand(&n, bits, 0, 1);
+ BN_MONT_CTX_set(mont, &n, ctx);
+
+ BN_nnmod(&a, &a, &n, ctx);
+ BN_nnmod(&b, &b, &n, ctx);
+
+ BN_to_montgomery(&A, &a, mont, ctx);
+ BN_to_montgomery(&B, &b, mont, ctx);
+
+ BN_mod_mul_montgomery(&c, &A, &B, mont, ctx); /**/
+ BN_from_montgomery(&A, &c, mont, ctx); /**/
+ if (bp != NULL) {
+ if (!results) {
+#ifdef undef
+ fprintf(stderr, "%d * %d %% %d\n", BN_num_bits(&a), BN_num_bits(&b),
+ BN_num_bits(mont->N));
+#endif
+ BN_print(bp, &a);
+ BIO_puts(bp, " * ");
+ BN_print(bp, &b);
+ BIO_puts(bp, " % ");
+ BN_print(bp, &(mont->N));
+ BIO_puts(bp, " - ");
+ }
+ BN_print(bp, &A);
+ BIO_puts(bp, "\n");
+ }
+ BN_mod_mul(&d, &a, &b, &n, ctx);
+ BN_sub(&d, &d, &A);
+ if (!BN_is_zero(&d)) {
+ fprintf(stderr, "Montgomery multiplication test failed!\n");
+ return 0;
+ }
+ }
+ BN_MONT_CTX_free(mont);
+ BN_free(&a);
+ BN_free(&b);
+ BN_free(&c);
+ BN_free(&d);
+ BN_free(&A);
+ BN_free(&B);
+ BN_free(&n);
+ return (1);
+}
+
+int test_mod(BIO *bp, BN_CTX *ctx) {
+ BIGNUM *a, *b, *c, *d, *e;
+ int i;
+
+ a = BN_new();
+ b = BN_new();
+ c = BN_new();
+ d = BN_new();
+ e = BN_new();
+
+ BN_rand(a, 1024, 0, 0); /**/
+ for (i = 0; i < num0; i++) {
+ BN_rand(b, 450 + i * 10, 0, 0); /**/
+ a->neg = rand_neg();
+ b->neg = rand_neg();
+ BN_mod(c, a, b, ctx); /**/
+ if (bp != NULL) {
+ if (!results) {
+ BN_print(bp, a);
+ BIO_puts(bp, " % ");
+ BN_print(bp, b);
+ BIO_puts(bp, " - ");
+ }
+ BN_print(bp, c);
+ BIO_puts(bp, "\n");
+ }
+ BN_div(d, e, a, b, ctx);
+ BN_sub(e, e, c);
+ if (!BN_is_zero(e)) {
+ fprintf(stderr, "Modulo test failed!\n");
+ return 0;
+ }
+ }
+ BN_free(a);
+ BN_free(b);
+ BN_free(c);
+ BN_free(d);
+ BN_free(e);
+ return (1);
+}
+
+int test_mod_mul(BIO *bp, BN_CTX *ctx) {
+ BIGNUM *a, *b, *c, *d, *e;
+ int i, j;
+
+ a = BN_new();
+ b = BN_new();
+ c = BN_new();
+ d = BN_new();
+ e = BN_new();
+
+ for (j = 0; j < 3; j++) {
+ BN_rand(c, 1024, 0, 0); /**/
+ for (i = 0; i < num0; i++) {
+ BN_rand(a, 475 + i * 10, 0, 0); /**/
+ BN_rand(b, 425 + i * 11, 0, 0); /**/
+ a->neg = rand_neg();
+ b->neg = rand_neg();
+ if (!BN_mod_mul(e, a, b, c, ctx)) {
+ unsigned long l;
+
+ while ((l = ERR_get_error()))
+ fprintf(stderr, "ERROR:%s\n", ERR_error_string(l, NULL));
+ abort();
+ }
+ if (bp != NULL) {
+ if (!results) {
+ BN_print(bp, a);
+ BIO_puts(bp, " * ");
+ BN_print(bp, b);
+ BIO_puts(bp, " % ");
+ BN_print(bp, c);
+ if ((a->neg ^ b->neg) && !BN_is_zero(e)) {
+ /* If (a*b) % c is negative, c must be added
+ * in order to obtain the normalized remainder
+ * (new with OpenSSL 0.9.7, previous versions of
+ * BN_mod_mul could generate negative results)
+ */
+ BIO_puts(bp, " + ");
+ BN_print(bp, c);
+ }
+ BIO_puts(bp, " - ");
+ }
+ BN_print(bp, e);
+ BIO_puts(bp, "\n");
+ }
+ BN_mul(d, a, b, ctx);
+ BN_sub(d, d, e);
+ BN_div(a, b, d, c, ctx);
+ if (!BN_is_zero(b)) {
+ fprintf(stderr, "Modulo multiply test failed!\n");
+ ERR_print_errors_fp(stderr);
+ return 0;
+ }
+ }
+ }
+ BN_free(a);
+ BN_free(b);
+ BN_free(c);
+ BN_free(d);
+ BN_free(e);
+ return (1);
+}
+
+int test_mod_exp(BIO *bp, BN_CTX *ctx) {
+ BIGNUM *a, *b, *c, *d, *e;
+ int i;
+
+ a = BN_new();
+ b = BN_new();
+ c = BN_new();
+ d = BN_new();
+ e = BN_new();
+
+ BN_rand(c, 30, 0, 1); /* must be odd for montgomery */
+ for (i = 0; i < num2; i++) {
+ BN_rand(a, 20 + i * 5, 0, 0); /**/
+ BN_rand(b, 2 + i, 0, 0); /**/
+
+ if (!BN_mod_exp(d, a, b, c, ctx))
+ return (0);
+
+ if (bp != NULL) {
+ if (!results) {
+ BN_print(bp, a);
+ BIO_puts(bp, " ^ ");
+ BN_print(bp, b);
+ BIO_puts(bp, " % ");
+ BN_print(bp, c);
+ BIO_puts(bp, " - ");
+ }
+ BN_print(bp, d);
+ BIO_puts(bp, "\n");
+ }
+ BN_exp(e, a, b, ctx);
+ BN_sub(e, e, d);
+ BN_div(a, b, e, c, ctx);
+ if (!BN_is_zero(b)) {
+ fprintf(stderr, "Modulo exponentiation test failed!\n");
+ return 0;
+ }
+ }
+ BN_free(a);
+ BN_free(b);
+ BN_free(c);
+ BN_free(d);
+ BN_free(e);
+ return (1);
+}
+
+int test_mod_exp_mont_consttime(BIO *bp, BN_CTX *ctx) {
+ BIGNUM *a, *b, *c, *d, *e;
+ int i;
+
+ a = BN_new();
+ b = BN_new();
+ c = BN_new();
+ d = BN_new();
+ e = BN_new();
+
+ BN_rand(c, 30, 0, 1); /* must be odd for montgomery */
+ for (i = 0; i < num2; i++) {
+ BN_rand(a, 20 + i * 5, 0, 0); /**/
+ BN_rand(b, 2 + i, 0, 0); /**/
+
+ if (!BN_mod_exp_mont_consttime(d, a, b, c, ctx, NULL))
+      return (0);
+
+ if (bp != NULL) {
+ if (!results) {
+ BN_print(bp, a);
+ BIO_puts(bp, " ^ ");
+ BN_print(bp, b);
+ BIO_puts(bp, " % ");
+ BN_print(bp, c);
+ BIO_puts(bp, " - ");
+ }
+ BN_print(bp, d);
+ BIO_puts(bp, "\n");
+ }
+ BN_exp(e, a, b, ctx);
+ BN_sub(e, e, d);
+ BN_div(a, b, e, c, ctx);
+ if (!BN_is_zero(b)) {
+ fprintf(stderr, "Modulo exponentiation test failed!\n");
+ return 0;
+ }
+ }
+ BN_free(a);
+ BN_free(b);
+ BN_free(c);
+ BN_free(d);
+ BN_free(e);
+ return (1);
+}
+
+/* Test constant-time modular exponentiation with 1024-bit inputs,
+ * which on x86_64 cause a different code branch to be taken. */
+int test_mod_exp_mont5(BIO *bp, BN_CTX *ctx) {
+ BIGNUM *a, *p, *m, *d, *e;
+
+ BN_MONT_CTX *mont;
+
+ a = BN_new();
+ p = BN_new();
+ m = BN_new();
+ d = BN_new();
+ e = BN_new();
+
+ mont = BN_MONT_CTX_new();
+
+ BN_rand(m, 1024, 0, 1); /* must be odd for montgomery */
+ /* Zero exponent */
+ BN_rand(a, 1024, 0, 0);
+ BN_zero(p);
+ if (!BN_mod_exp_mont_consttime(d, a, p, m, ctx, NULL))
+ return 0;
+ if (!BN_is_one(d)) {
+ fprintf(stderr, "Modular exponentiation test failed!\n");
+ return 0;
+ }
+ /* Zero input */
+ BN_rand(p, 1024, 0, 0);
+ BN_zero(a);
+ if (!BN_mod_exp_mont_consttime(d, a, p, m, ctx, NULL))
+ return 0;
+ if (!BN_is_zero(d)) {
+ fprintf(stderr, "Modular exponentiation test failed!\n");
+ return 0;
+ }
+ /* Craft an input whose Montgomery representation is 1,
+ * i.e., shorter than the modulus m, in order to test
+ * the const time precomputation scattering/gathering.
+ */
+ BN_one(a);
+ BN_MONT_CTX_set(mont, m, ctx);
+ if (!BN_from_montgomery(e, a, mont, ctx) ||
+ !BN_mod_exp_mont_consttime(d, e, p, m, ctx, NULL) ||
+ !BN_mod_exp(a, e, p, m, ctx)) {
+ return 0;
+ }
+ if (BN_cmp(a, d) != 0) {
+ fprintf(stderr, "Modular exponentiation test failed!\n");
+ return 0;
+ }
+ /* Finally, some regular test vectors. */
+ BN_rand(e, 1024, 0, 0);
+ if (!BN_mod_exp_mont_consttime(d, e, p, m, ctx, NULL))
+ return 0;
+ if (!BN_mod_exp(a, e, p, m, ctx))
+ return 0;
+ if (BN_cmp(a, d) != 0) {
+ fprintf(stderr, "Modular exponentiation test failed!\n");
+ return 0;
+ }
+
+ BN_MONT_CTX_free(mont);
+ BN_free(a);
+ BN_free(p);
+ BN_free(m);
+ BN_free(d);
+ BN_free(e);
+ return (1);
+}
+
+int test_exp(BIO *bp, BN_CTX *ctx) {
+ BIGNUM *a, *b, *d, *e, *one;
+ int i;
+
+ a = BN_new();
+ b = BN_new();
+ d = BN_new();
+ e = BN_new();
+ one = BN_new();
+ BN_one(one);
+
+ for (i = 0; i < num2; i++) {
+ BN_rand(a, 20 + i * 5, 0, 0); /**/
+ BN_rand(b, 2 + i, 0, 0); /**/
+
+ if (BN_exp(d, a, b, ctx) <= 0)
+ return (0);
+
+ if (bp != NULL) {
+ if (!results) {
+ BN_print(bp, a);
+ BIO_puts(bp, " ^ ");
+ BN_print(bp, b);
+ BIO_puts(bp, " - ");
+ }
+ BN_print(bp, d);
+ BIO_puts(bp, "\n");
+ }
+ BN_one(e);
+ for (; !BN_is_zero(b); BN_sub(b, b, one))
+ BN_mul(e, e, a, ctx);
+ BN_sub(e, e, d);
+ if (!BN_is_zero(e)) {
+ fprintf(stderr, "Exponentiation test failed!\n");
+ return 0;
+ }
+ }
+ BN_free(a);
+ BN_free(b);
+ BN_free(d);
+ BN_free(e);
+ BN_free(one);
+ return (1);
+}
+
+/* test_exp_mod_zero tests that x**0 mod 1 == 0. */
+static int test_exp_mod_zero(void) {
+ BIGNUM a, p, m;
+ BIGNUM r;
+ BN_CTX *ctx = BN_CTX_new();
+ int ret = 0;
+
+ BN_init(&m);
+ BN_one(&m);
+
+ BN_init(&a);
+ BN_one(&a);
+
+ BN_init(&p);
+ BN_zero(&p);
+
+ BN_init(&r);
+ BN_mod_exp(&r, &a, &p, &m, ctx);
+ BN_CTX_free(ctx);
+
+ if (BN_is_zero(&r)) {
+ ret = 1;
+ } else {
+ printf("1**0 mod 1 = ");
+ BN_print_fp(stdout, &r);
+ printf(", should be 0\n");
+ }
+
+ BN_free(&r);
+ BN_free(&a);
+ BN_free(&p);
+ BN_free(&m);
+
+ return ret;
+}
+
+static int genprime_cb(int p, int n, BN_GENCB *arg) {
+ char c = '*';
+
+ if (p == 0)
+ c = '.';
+ if (p == 1)
+ c = '+';
+ if (p == 2)
+ c = '*';
+ if (p == 3)
+ c = '\n';
+ putc(c, stdout);
+ fflush(stdout);
+ return 1;
+}
+
+int test_mod_sqrt(BIO *bp, BN_CTX *ctx) {
+ BN_GENCB cb;
+ BIGNUM *a, *p, *r;
+ int i, j;
+ int ret = 0;
+
+ a = BN_new();
+ p = BN_new();
+ r = BN_new();
+ if (a == NULL || p == NULL || r == NULL)
+ goto err;
+
+ BN_GENCB_set(&cb, genprime_cb, NULL);
+
+ for (i = 0; i < 16; i++) {
+ if (i < 8) {
+ unsigned primes[8] = {2, 3, 5, 7, 11, 13, 17, 19};
+
+ if (!BN_set_word(p, primes[i]))
+ goto err;
+ } else {
+ if (!BN_set_word(a, 32))
+ goto err;
+ if (!BN_set_word(r, 2 * i + 1))
+ goto err;
+
+ if (!BN_generate_prime_ex(p, 256, 0, a, r, &cb))
+ goto err;
+ putc('\n', stdout);
+ }
+ p->neg = rand_neg();
+
+ for (j = 0; j < num2; j++) {
+ /* construct 'a' such that it is a square modulo p,
+ * but in general not a proper square and not reduced modulo p */
+ if (!BN_rand(r, 256, 0, 3))
+ goto err;
+ if (!BN_nnmod(r, r, p, ctx))
+ goto err;
+ if (!BN_mod_sqr(r, r, p, ctx))
+ goto err;
+ if (!BN_rand(a, 256, 0, 3))
+ goto err;
+ if (!BN_nnmod(a, a, p, ctx))
+ goto err;
+ if (!BN_mod_sqr(a, a, p, ctx))
+ goto err;
+ if (!BN_mul(a, a, r, ctx))
+ goto err;
+ if (rand_neg())
+ if (!BN_sub(a, a, p))
+ goto err;
+
+ if (!BN_mod_sqrt(r, a, p, ctx))
+ goto err;
+ if (!BN_mod_sqr(r, r, p, ctx))
+ goto err;
+
+ if (!BN_nnmod(a, a, p, ctx))
+ goto err;
+
+ if (BN_cmp(a, r) != 0) {
+ fprintf(stderr, "BN_mod_sqrt failed: a = ");
+ BN_print_fp(stderr, a);
+ fprintf(stderr, ", r = ");
+ BN_print_fp(stderr, r);
+ fprintf(stderr, ", p = ");
+ BN_print_fp(stderr, p);
+ fprintf(stderr, "\n");
+ goto err;
+ }
+
+ putc('.', stdout);
+ fflush(stdout);
+ }
+
+ putc('\n', stdout);
+ fflush(stderr);
+ }
+ ret = 1;
+err:
+ if (a != NULL)
+ BN_free(a);
+ if (p != NULL)
+ BN_free(p);
+ if (r != NULL)
+ BN_free(r);
+ return ret;
+}
+
+int test_small_prime(BIO *bp, BN_CTX *ctx) {
+ static const int bits = 10;
+ int ret = 0;
+ BIGNUM r;
+
+ BN_init(&r);
+ if (!BN_generate_prime_ex(&r, bits, 0, NULL, NULL, NULL)) {
+ goto err;
+ }
+ if (BN_num_bits(&r) != bits) {
+ BIO_printf(bp, "Expected %d bit prime, got %d bit number\n", bits,
+ BN_num_bits(&r));
+ goto err;
+ }
+
+ ret = 1;
+
+err:
+ BN_free(&r);
+ return ret;
+}
+
+int test_sqrt(BIO *bp, BN_CTX *ctx) {
+ BIGNUM *n = BN_new(), *nn = BN_new(), *sqrt = BN_new();
+ unsigned i;
+
+ /* Test some random squares. */
+ for (i = 0; i < 100; i++) {
+ if (!BN_rand(n, 1024 /* bit length */, -1 /* no modification of top bits */,
+ 0 /* don't modify bottom bit */) ||
+ !BN_mul(nn, n, n, ctx) ||
+ !BN_sqrt(sqrt, nn, ctx)) {
+      ERR_print_errors_fp(stderr);
+ return 0;
+ }
+ if (BN_cmp(n, sqrt) != 0) {
+ fprintf(stderr, "Bad result from BN_sqrt.\n");
+ return 0;
+ }
+ }
+
+  /* Test some non-squares. */
+ for (i = 0; i < 100; i++) {
+ if (!BN_rand(n, 1024 /* bit length */, -1 /* no modification of top bits */,
+ 0 /* don't modify bottom bit */) ||
+ !BN_mul(nn, n, n, ctx) ||
+ !BN_add(nn, nn, BN_value_one())) {
+      ERR_print_errors_fp(stderr);
+ return 0;
+ }
+
+    if (BN_sqrt(sqrt, nn, ctx)) {
+      char *nn_str = BN_bn2dec(nn);
+      fprintf(stderr, "BN_sqrt didn't fail on a non-square: %s\n", nn_str);
+      OPENSSL_free(nn_str);
+      return 0;
+    }
+ }
+
+ BN_free(n);
+ BN_free(sqrt);
+ BN_free(nn);
+
+ return 1;
+}
+
+int test_bn2bin_padded(BIO *bp, BN_CTX *ctx) {
+ BIGNUM *n = BN_new();
+ uint8_t zeros[256], out[256], reference[128];
+ size_t bytes;
+
+ memset(zeros, 0, sizeof(zeros));
+
+ /* Test edge case at 0. */
+ if (!BN_bn2bin_padded(NULL, 0, n)) {
+ fprintf(stderr,
+ "BN_bn2bin_padded failed to encode 0 in an empty buffer.\n");
+ return 0;
+ }
+ memset(out, -1, sizeof(out));
+ if (!BN_bn2bin_padded(out, sizeof(out), n)) {
+ fprintf(stderr,
+ "BN_bn2bin_padded failed to encode 0 in a non-empty buffer.\n");
+ return 0;
+ }
+ if (memcmp(zeros, out, sizeof(out))) {
+ fprintf(stderr, "BN_bn2bin_padded did not zero buffer.\n");
+ return 0;
+ }
+
+  /* Test random numbers at various byte lengths. */
+ for (bytes = 128 - 7; bytes <= 128; bytes++) {
+ if (!BN_rand(n, bytes * 8, 0 /* make sure top bit is 1 */,
+ 0 /* don't modify bottom bit */)) {
+      ERR_print_errors_fp(stderr);
+ return 0;
+ }
+ if (BN_num_bytes(n) != bytes || BN_bn2bin(n, reference) != bytes) {
+ fprintf(stderr, "Bad result from BN_rand; bytes.\n");
+ return 0;
+ }
+ /* Empty buffer should fail. */
+ if (BN_bn2bin_padded(NULL, 0, n)) {
+ fprintf(stderr,
+ "BN_bn2bin_padded incorrectly succeeded on empty buffer.\n");
+ return 0;
+ }
+ /* One byte short should fail. */
+ if (BN_bn2bin_padded(out, bytes - 1, n)) {
+ fprintf(stderr, "BN_bn2bin_padded incorrectly succeeded on short.\n");
+ return 0;
+ }
+ /* Exactly right size should encode. */
+ if (!BN_bn2bin_padded(out, bytes, n) ||
+ memcmp(out, reference, bytes) != 0) {
+ fprintf(stderr, "BN_bn2bin_padded gave a bad result.\n");
+ return 0;
+ }
+ /* Pad up one byte extra. */
+ if (!BN_bn2bin_padded(out, bytes + 1, n) ||
+ memcmp(out + 1, reference, bytes) || memcmp(out, zeros, 1)) {
+ fprintf(stderr, "BN_bn2bin_padded gave a bad result.\n");
+ return 0;
+ }
+ /* Pad up to 256. */
+ if (!BN_bn2bin_padded(out, sizeof(out), n) ||
+ memcmp(out + sizeof(out) - bytes, reference, bytes) ||
+ memcmp(out, zeros, sizeof(out) - bytes)) {
+ fprintf(stderr, "BN_bn2bin_padded gave a bad result.\n");
+ return 0;
+ }
+ }
+
+ BN_free(n);
+
+ return 1;
+}
diff --git a/src/crypto/bn/cmp.c b/src/crypto/bn/cmp.c
new file mode 100644
index 0000000..fce7233
--- /dev/null
+++ b/src/crypto/bn/cmp.c
@@ -0,0 +1,200 @@
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to. The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code. The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * "This product includes cryptographic software written by
+ * Eric Young (eay@cryptsoft.com)"
+ * The word 'cryptographic' can be left out if the rouines from the library
+ * being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ * the apps directory (application code) you must include an acknowledgement:
+ * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed. i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.] */
+
+#include <openssl/bn.h>
+
+#include "internal.h"
+
+
+int BN_ucmp(const BIGNUM *a, const BIGNUM *b) {
+ int i;
+ BN_ULONG t1, t2, *ap, *bp;
+
+ i = a->top - b->top;
+ if (i != 0) {
+ return i;
+ }
+
+ ap = a->d;
+ bp = b->d;
+ for (i = a->top - 1; i >= 0; i--) {
+ t1 = ap[i];
+ t2 = bp[i];
+ if (t1 != t2) {
+ return (t1 > t2) ? 1 : -1;
+ }
+ }
+
+ return 0;
+}
+
+int BN_cmp(const BIGNUM *a, const BIGNUM *b) {
+ int i;
+ int gt, lt;
+ BN_ULONG t1, t2;
+
+ if ((a == NULL) || (b == NULL)) {
+ if (a != NULL) {
+ return -1;
+ } else if (b != NULL) {
+ return 1;
+ } else {
+ return 0;
+ }
+ }
+
+ if (a->neg != b->neg) {
+ if (a->neg) {
+ return -1;
+ }
+ return 1;
+ }
+ if (a->neg == 0) {
+ gt = 1;
+ lt = -1;
+ } else {
+ gt = -1;
+ lt = 1;
+ }
+
+ if (a->top > b->top) {
+ return gt;
+ }
+ if (a->top < b->top) {
+ return lt;
+ }
+
+ for (i = a->top - 1; i >= 0; i--) {
+ t1 = a->d[i];
+ t2 = b->d[i];
+    if (t1 > t2) {
+      return gt;
+    }
+    if (t1 < t2) {
+      return lt;
+    }
+ }
+
+ return 0;
+}
+
+int bn_cmp_words(const BN_ULONG *a, const BN_ULONG *b, int n) {
+ int i;
+ BN_ULONG aa, bb;
+
+ aa = a[n - 1];
+ bb = b[n - 1];
+ if (aa != bb) {
+ return (aa > bb) ? 1 : -1;
+ }
+
+ for (i = n - 2; i >= 0; i--) {
+ aa = a[i];
+ bb = b[i];
+ if (aa != bb) {
+ return (aa > bb) ? 1 : -1;
+ }
+ }
+ return 0;
+}
+
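+/* bn_cmp_part_words compares |a| and |b|, which share |cl| low words, where
+ * |dl| is the difference in their word counts: dl > 0 means |a| has |dl|
+ * extra high words and dl < 0 means |b| does. Any non-zero extra word
+ * decides the comparison; otherwise the |cl| common words decide it. */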
+int bn_cmp_part_words(const BN_ULONG *a, const BN_ULONG *b, int cl, int dl) {
+ int n, i;
+ n = cl - 1;
+
+ if (dl < 0) {
+ for (i = dl; i < 0; i++) {
+ if (b[n - i] != 0) {
+ return -1; /* a < b */
+ }
+ }
+ }
+ if (dl > 0) {
+ for (i = dl; i > 0; i--) {
+ if (a[n + i] != 0) {
+ return 1; /* a > b */
+ }
+ }
+ }
+
+ return bn_cmp_words(a, b, cl);
+}
+
+int BN_abs_is_word(const BIGNUM *bn, BN_ULONG w) {
+ switch (bn->top) {
+ case 1:
+ return bn->d[0] == w;
+ case 0:
+ return w == 0;
+ default:
+ return 0;
+ }
+}
+
+int BN_is_zero(const BIGNUM *bn) {
+ return bn->top == 0;
+}
+
+int BN_is_one(const BIGNUM *bn) {
+ return bn->neg == 0 && BN_abs_is_word(bn, 1);
+}
+
+int BN_is_word(const BIGNUM *bn, BN_ULONG w) {
+ return BN_abs_is_word(bn, w) && (w == 0 || bn->neg == 0);
+}
+
+int BN_is_odd(const BIGNUM *bn) {
+ return bn->top > 0 && (bn->d[0] & 1) == 1;
+}
diff --git a/src/crypto/bn/convert.c b/src/crypto/bn/convert.c
new file mode 100644
index 0000000..f764eed
--- /dev/null
+++ b/src/crypto/bn/convert.c
@@ -0,0 +1,504 @@
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to. The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code. The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * "This product includes cryptographic software written by
+ * Eric Young (eay@cryptsoft.com)"
+ * The word 'cryptographic' can be left out if the rouines from the library
+ * being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ * the apps directory (application code) you must include an acknowledgement:
+ * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed. i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.] */
+
+#include <openssl/bn.h>
+
+#include <ctype.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <openssl/bio.h>
+#include <openssl/err.h>
+#include <openssl/mem.h>
+
+#include "internal.h"
+
+BIGNUM *BN_bin2bn(const uint8_t *in, size_t len, BIGNUM *ret) {
+ unsigned num_words, m;
+ BN_ULONG word = 0;
+ BIGNUM *bn = NULL;
+
+ if (ret == NULL) {
+ ret = bn = BN_new();
+ }
+
+ if (ret == NULL) {
+ return NULL;
+ }
+
+ if (len == 0) {
+ ret->top = 0;
+ return ret;
+ }
+
+ num_words = ((len - 1) / BN_BYTES) + 1;
+ m = (len - 1) % BN_BYTES;
+ if (bn_wexpand(ret, num_words) == NULL) {
+ if (bn) {
+ BN_free(bn);
+ }
+ return NULL;
+ }
+
+ ret->top = num_words;
+ ret->neg = 0;
+
+ while (len--) {
+ word = (word << 8) | *(in++);
+ if (m-- == 0) {
+ ret->d[--num_words] = word;
+ word = 0;
+ m = BN_BYTES - 1;
+ }
+ }
+
+ /* need to call this due to clear byte at top if avoiding having the top bit
+ * set (-ve number) */
+ bn_correct_top(ret);
+ return ret;
+}
+
+size_t BN_bn2bin(const BIGNUM *in, uint8_t *out) {
+ size_t n, i;
+ BN_ULONG l;
+
+ n = i = BN_num_bytes(in);
+ while (i--) {
+ l = in->d[i / BN_BYTES];
+ *(out++) = (unsigned char)(l >> (8 * (i % BN_BYTES))) & 0xff;
+ }
+ return n;
+}
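+
+/* A round-trip sketch through the big-endian encoding, assuming |bn| is
+ * non-negative and |buf| holds at least BN_num_bytes(bn) bytes:
+ *
+ *   uint8_t buf[64];
+ *   size_t n = BN_bn2bin(bn, buf);
+ *   BIGNUM *copy = BN_bin2bn(buf, n, NULL);
+ */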
+
+/* constant_time_select_ulong returns |x| if |v| is 1 and |y| if |v| is 0. Its
+ * behavior is undefined if |v| takes any other value. */
+static BN_ULONG constant_time_select_ulong(int v, BN_ULONG x, BN_ULONG y) {
+ BN_ULONG mask = v;
+ mask--;
+
+ return (~mask & x) | (mask & y);
+}
+
+/* constant_time_le_size_t returns 1 if |x| <= |y| and 0 otherwise. |x| and |y|
+ * must not have their MSBs set. */
+static int constant_time_le_size_t(size_t x, size_t y) {
+ return ((x - y - 1) >> (sizeof(size_t) * 8 - 1)) & 1;
+}
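+
+/* Both helpers above rely on unsigned wraparound rather than branches: with
+ * |v| == 1, |mask| underflows to 0 and |~mask & x| selects |x|; with
+ * |v| == 0, |mask| becomes all-ones and |mask & y| selects |y|. Likewise,
+ * |x - y - 1| has its top bit set exactly when x <= y (given the MSB
+ * precondition). For example:
+ *
+ *   constant_time_select_ulong(1, 7, 9);  // == 7
+ *   constant_time_select_ulong(0, 7, 9);  // == 9
+ *   constant_time_le_size_t(3, 3);        // == 1
+ *   constant_time_le_size_t(4, 3);        // == 0
+ */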
+
+/* read_word_padded returns the |i|'th word of |in|, if it is not out of
+ * bounds. Otherwise, it returns 0. It does so without branches on the size of
+ * |in|, though it necessarily does not have the same memory access pattern as
+ * an in-bounds read: if the access would be out of bounds, it reads the last
+ * word of |in| instead. |in| must not be zero. */
+static BN_ULONG read_word_padded(const BIGNUM *in, size_t i) {
+ /* Read |in->d[i]| if valid. Otherwise, read the last word. */
+ BN_ULONG l = in->d[constant_time_select_ulong(
+ constant_time_le_size_t(in->dmax, i), in->dmax - 1, i)];
+
+ /* Clamp to zero if above |d->top|. */
+ return constant_time_select_ulong(constant_time_le_size_t(in->top, i), 0, l);
+}
+
+int BN_bn2bin_padded(uint8_t *out, size_t len, const BIGNUM *in) {
+ size_t i;
+ BN_ULONG l;
+
+ /* Special case for |in| = 0. Just branch as the probability is negligible. */
+ if (BN_is_zero(in)) {
+ memset(out, 0, len);
+ return 1;
+ }
+
+ /* Check if the integer is too big. This case can exit early in non-constant
+ * time. */
+ if ((size_t)in->top > (len + (BN_BYTES - 1)) / BN_BYTES) {
+ return 0;
+ }
+ if ((len % BN_BYTES) != 0) {
+ l = read_word_padded(in, len / BN_BYTES);
+ if (l >> (8 * (len % BN_BYTES)) != 0) {
+ return 0;
+ }
+ }
+
+ /* Write the bytes out one by one. Serialization is done without branching on
+ * the bits of |in| or on |in->top|, but if the routine would otherwise read
+ * out of bounds, the memory access pattern can't be fixed. However, for an
+ * RSA key of size a multiple of the word size, the probability of BN_BYTES
+ * leading zero octets is low.
+ *
+ * See Falko Stenzke, "Manger's Attack revisited", ICICS 2010. */
+ i = len;
+ while (i--) {
+ l = read_word_padded(in, i / BN_BYTES);
+ *(out++) = (uint8_t)(l >> (8 * (i % BN_BYTES))) & 0xff;
+ }
+ return 1;
+}
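+
+/* A usage sketch for the function above: serializing to a fixed width, as
+ * RSA needs output that is exactly the modulus size (the buffer size here is
+ * illustrative):
+ *
+ *   uint8_t out[256];  // e.g. for a 2048-bit modulus
+ *   if (!BN_bn2bin_padded(out, sizeof(out), bn)) {
+ *     // |bn| needs more than 256 bytes.
+ *   }
+ *   // On success, |out| holds |bn| big-endian, left-padded with zeros.
+ */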
+
+static const char hextable[] = "0123456789abcdef";
+
+char *BN_bn2hex(const BIGNUM *bn) {
+ int i, j, v, z = 0;
+ char *buf;
+ char *p;
+
+ buf = (char *)OPENSSL_malloc(bn->top * BN_BYTES * 2 + 2);
+ if (buf == NULL) {
+ OPENSSL_PUT_ERROR(BN, BN_bn2hex, ERR_R_MALLOC_FAILURE);
+ return NULL;
+ }
+
+ p = buf;
+ if (bn->neg) {
+ *(p++) = '-';
+ }
+
+ if (BN_is_zero(bn)) {
+ *(p++) = '0';
+ }
+
+ for (i = bn->top - 1; i >= 0; i--) {
+ for (j = BN_BITS2 - 8; j >= 0; j -= 8) {
+ /* strip leading zeros */
+ v = ((int)(bn->d[i] >> (long)j)) & 0xff;
+ if (z || v != 0) {
+ *(p++) = hextable[v >> 4];
+ *(p++) = hextable[v & 0x0f];
+ z = 1;
+ }
+ }
+ }
+ *p = '\0';
+
+ return buf;
+}
+
+/* decode_hex decodes |i| bytes of hex data from |in| and updates |bn|. */
+static void decode_hex(BIGNUM *bn, const char *in, int i) {
+ int h, m, j, k, c;
+  BN_ULONG l = 0;
+
+  j = i; /* number of digits still to process */
+ h = 0;
+ while (j > 0) {
+ m = ((BN_BYTES * 2) <= j) ? (BN_BYTES * 2) : j;
+ l = 0;
+ for (;;) {
+ c = in[j - m];
+ if ((c >= '0') && (c <= '9')) {
+ k = c - '0';
+ } else if ((c >= 'a') && (c <= 'f')) {
+ k = c - 'a' + 10;
+ } else if ((c >= 'A') && (c <= 'F')) {
+ k = c - 'A' + 10;
+ } else {
+ k = 0; /* paranoia */
+ }
+
+ l = (l << 4) | k;
+
+ if (--m <= 0) {
+ bn->d[h++] = l;
+ break;
+ }
+ }
+
+ j -= (BN_BYTES * 2);
+ }
+
+ bn->top = h;
+}
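+
+/* A worked instance of the loop above: on a 64-bit build (BN_BYTES == 8),
+ * decoding the 20 hex digits "123456789abcdef01234" packs the low 16 digits
+ * into bn->d[0] and the remaining high 4 ("1234") into bn->d[1], so
+ * bn->top == 2. */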
+
+/* decode_dec decodes |i| bytes of decimal data from |in| and updates |bn|. */
+static void decode_dec(BIGNUM *bn, const char *in, int i) {
+ int j;
+ BN_ULONG l = 0;
+
+ j = BN_DEC_NUM - (i % BN_DEC_NUM);
+ if (j == BN_DEC_NUM) {
+ j = 0;
+ }
+ l = 0;
+ while (*in) {
+ l *= 10;
+ l += *in - '0';
+ in++;
+ if (++j == BN_DEC_NUM) {
+ BN_mul_word(bn, BN_DEC_CONV);
+ BN_add_word(bn, l);
+ l = 0;
+ j = 0;
+ }
+ }
+}
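+
+/* A worked instance: with BN_DEC_NUM == 19 on 64-bit builds, decoding the
+ * 23-digit string "12345678901234567890123" first accumulates the leading
+ * 4 digits (23 mod 19) before folding them in with BN_mul_word/BN_add_word,
+ * then folds the remaining 19 digits as one full chunk. */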
+
+typedef void (*decode_func) (BIGNUM *bn, const char *in, int i);
+typedef int (*char_test_func) (int c);
+
+static int bn_x2bn(BIGNUM **outp, const char *in, decode_func decode, char_test_func want_char) {
+ BIGNUM *ret = NULL;
+ int neg = 0, i;
+ int num;
+
+ if (in == NULL || *in == 0) {
+ return 0;
+ }
+
+ if (*in == '-') {
+ neg = 1;
+ in++;
+ }
+
+ for (i = 0; want_char((unsigned char)in[i]); i++) {}
+
+ num = i + neg;
+ if (outp == NULL) {
+ return num;
+ }
+
+  /* in is the start of the digits, and it is 'i' long */
+ if (*outp == NULL) {
+ ret = BN_new();
+ if (ret == NULL) {
+ return 0;
+ }
+ } else {
+ ret = *outp;
+ BN_zero(ret);
+ }
+ ret->neg = neg;
+
+  /* i is the number of digits; each needs at most four bits, so i * 4 bits
+   * is an upper bound for both hex and decimal input. */
+ if (bn_expand(ret, i * 4) == NULL) {
+ goto err;
+ }
+
+ decode(ret, in, i);
+
+ bn_correct_top(ret);
+
+ *outp = ret;
+ return num;
+
+err:
+ if (*outp == NULL) {
+ BN_free(ret);
+ }
+
+ return 0;
+}
+
+int BN_hex2bn(BIGNUM **outp, const char *in) {
+ return bn_x2bn(outp, in, decode_hex, isxdigit);
+}
+
+char *BN_bn2dec(const BIGNUM *a) {
+ int i = 0, num, ok = 0;
+ char *buf = NULL;
+ char *p;
+ BIGNUM *t = NULL;
+ BN_ULONG *bn_data = NULL, *lp;
+
+  /* Get an upper bound for the length of the decimal integer:
+   * num <= (BN_num_bits(a) + 1) * log10(2)
+   *     <= 3 * BN_num_bits(a) * 0.1001 + log10(2) + 1  (rounding error)
+   *     <= BN_num_bits(a)/10 + BN_num_bits(a)/1000 + 1 + 1 */
+ i = BN_num_bits(a) * 3;
+ num = i / 10 + i / 1000 + 1 + 1;
+ bn_data =
+ (BN_ULONG *)OPENSSL_malloc((num / BN_DEC_NUM + 1) * sizeof(BN_ULONG));
+ buf = (char *)OPENSSL_malloc(num + 3);
+ if ((buf == NULL) || (bn_data == NULL)) {
+ OPENSSL_PUT_ERROR(BN, BN_bn2dec, ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+ t = BN_dup(a);
+ if (t == NULL) {
+ goto err;
+ }
+
+#define BUF_REMAIN (num + 3 - (size_t)(p - buf))
+ p = buf;
+ lp = bn_data;
+ if (BN_is_zero(t)) {
+ *(p++) = '0';
+ *(p++) = '\0';
+ } else {
+ if (BN_is_negative(t)) {
+ *p++ = '-';
+ }
+
+ while (!BN_is_zero(t)) {
+ *lp = BN_div_word(t, BN_DEC_CONV);
+ lp++;
+ }
+ lp--;
+ /* We now have a series of blocks, BN_DEC_NUM chars
+ * in length, where the last one needs truncation.
+ * The blocks need to be reversed in order. */
+ BIO_snprintf(p, BUF_REMAIN, BN_DEC_FMT1, *lp);
+ while (*p) {
+ p++;
+ }
+ while (lp != bn_data) {
+ lp--;
+ BIO_snprintf(p, BUF_REMAIN, BN_DEC_FMT2, *lp);
+ while (*p) {
+ p++;
+ }
+ }
+ }
+ ok = 1;
+
+err:
+ if (bn_data != NULL) {
+ OPENSSL_free(bn_data);
+ }
+ if (t != NULL) {
+ BN_free(t);
+ }
+ if (!ok && buf) {
+ OPENSSL_free(buf);
+ buf = NULL;
+ }
+
+ return buf;
+}
+
+int BN_dec2bn(BIGNUM **outp, const char *in) {
+ return bn_x2bn(outp, in, decode_dec, isdigit);
+}
+
+int BN_asc2bn(BIGNUM **outp, const char *in) {
+ const char *const orig_in = in;
+ if (*in == '-') {
+ in++;
+ }
+
+ if (in[0] == '0' && (in[1] == 'X' || in[1] == 'x')) {
+ if (!BN_hex2bn(outp, in+2)) {
+ return 0;
+ }
+ } else {
+ if (!BN_dec2bn(outp, in)) {
+ return 0;
+ }
+ }
+
+ if (*orig_in == '-') {
+ (*outp)->neg = 1;
+ }
+
+ return 1;
+}
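+
+/* A conversion sketch tying the parsers above together (error handling
+ * elided; the returned strings must be freed with OPENSSL_free):
+ *
+ *   BIGNUM *bn = NULL;
+ *   BN_asc2bn(&bn, "0x1234");   // hex, via BN_hex2bn
+ *   BN_asc2bn(&bn, "-42");      // decimal, via BN_dec2bn
+ *   char *hex = BN_bn2hex(bn);  // "-2a"
+ *   char *dec = BN_bn2dec(bn);  // "-42"
+ */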
+
+int BN_print(BIO *bp, const BIGNUM *a) {
+ int i, j, v, z = 0;
+ int ret = 0;
+
+ if (a->neg && BIO_write(bp, "-", 1) != 1) {
+ goto end;
+ }
+
+ if (BN_is_zero(a) && BIO_write(bp, "0", 1) != 1) {
+ goto end;
+ }
+
+ for (i = a->top - 1; i >= 0; i--) {
+ for (j = BN_BITS2 - 4; j >= 0; j -= 4) {
+ /* strip leading zeros */
+ v = ((int)(a->d[i] >> (long)j)) & 0x0f;
+ if (z || v != 0) {
+ if (BIO_write(bp, &hextable[v], 1) != 1) {
+ goto end;
+ }
+ z = 1;
+ }
+ }
+ }
+ ret = 1;
+
+end:
+ return ret;
+}
+
+int BN_print_fp(FILE *fp, const BIGNUM *a) {
+ BIO *b;
+ int ret;
+
+ b = BIO_new(BIO_s_file());
+ if (b == NULL) {
+ return 0;
+ }
+ BIO_set_fp(b, fp, BIO_NOCLOSE);
+ ret = BN_print(b, a);
+ BIO_free(b);
+
+ return ret;
+}
+
+BN_ULONG BN_get_word(const BIGNUM *bn) {
+ switch (bn->top) {
+ case 0:
+ return 0;
+ case 1:
+ return bn->d[0];
+ default:
+ return BN_MASK2;
+ }
+}
diff --git a/src/crypto/bn/ctx.c b/src/crypto/bn/ctx.c
new file mode 100644
index 0000000..e54007b
--- /dev/null
+++ b/src/crypto/bn/ctx.c
@@ -0,0 +1,315 @@
+/* Written by Ulf Moeller for the OpenSSL project. */
+/* ====================================================================
+ * Copyright (c) 1998-2004 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com). */
+
+
+#include <openssl/bn.h>
+
+#include <string.h>
+
+#include <openssl/err.h>
+#include <openssl/mem.h>
+
+
+/* How many bignums are in each "pool item". */
+#define BN_CTX_POOL_SIZE 16
+/* The stack frame info is resizable; this sets the first-time expansion
+ * size. */
+#define BN_CTX_START_FRAMES 32
+
+/* A bundle of bignums that can be linked with other bundles */
+typedef struct bignum_pool_item {
+ /* The bignum values */
+ BIGNUM vals[BN_CTX_POOL_SIZE];
+ /* Linked-list admin */
+ struct bignum_pool_item *prev, *next;
+} BN_POOL_ITEM;
+
+
+typedef struct bignum_pool {
+ /* Linked-list admin */
+ BN_POOL_ITEM *head, *current, *tail;
+ /* Stack depth and allocation size */
+ unsigned used, size;
+} BN_POOL;
+
+static void BN_POOL_init(BN_POOL *);
+static void BN_POOL_finish(BN_POOL *);
+static BIGNUM *BN_POOL_get(BN_POOL *);
+static void BN_POOL_release(BN_POOL *, unsigned int);
+
+/************/
+/* BN_STACK */
+/************/
+
+/* A wrapper to manage the "stack frames" */
+typedef struct bignum_ctx_stack {
+ /* Array of indexes into the bignum stack */
+ unsigned int *indexes;
+ /* Number of stack frames, and the size of the allocated array */
+ unsigned int depth, size;
+} BN_STACK;
+
+static void BN_STACK_init(BN_STACK *);
+static void BN_STACK_finish(BN_STACK *);
+static int BN_STACK_push(BN_STACK *, unsigned int);
+static unsigned int BN_STACK_pop(BN_STACK *);
+
+/**********/
+/* BN_CTX */
+/**********/
+
+/* The opaque BN_CTX type */
+struct bignum_ctx {
+ /* The bignum bundles */
+ BN_POOL pool;
+ /* The "stack frames", if you will */
+ BN_STACK stack;
+ /* The number of bignums currently assigned */
+ unsigned int used;
+  /* The number of stack frames opened since an error occurred */
+ int err_stack;
+ /* Block "gets" until an "end" (compatibility behaviour) */
+ int too_many;
+};
+
+BN_CTX *BN_CTX_new(void) {
+ BN_CTX *ret = OPENSSL_malloc(sizeof(BN_CTX));
+ if (!ret) {
+ OPENSSL_PUT_ERROR(BN, BN_CTX_new, ERR_R_MALLOC_FAILURE);
+ return NULL;
+ }
+
+ /* Initialise the structure */
+ BN_POOL_init(&ret->pool);
+ BN_STACK_init(&ret->stack);
+ ret->used = 0;
+ ret->err_stack = 0;
+ ret->too_many = 0;
+ return ret;
+}
+
+void BN_CTX_free(BN_CTX *ctx) {
+ if (ctx == NULL) {
+ return;
+ }
+
+ BN_STACK_finish(&ctx->stack);
+ BN_POOL_finish(&ctx->pool);
+ OPENSSL_free(ctx);
+}
+
+void BN_CTX_start(BN_CTX *ctx) {
+ /* If we're already overflowing ... */
+ if (ctx->err_stack || ctx->too_many) {
+ ctx->err_stack++;
+ } else if (!BN_STACK_push(&ctx->stack, ctx->used)) {
+ /* (Try to) get a new frame pointer */
+ OPENSSL_PUT_ERROR(BN, BN_CTX_start, BN_R_TOO_MANY_TEMPORARY_VARIABLES);
+ ctx->err_stack++;
+ }
+}
+
+BIGNUM *BN_CTX_get(BN_CTX *ctx) {
+ BIGNUM *ret;
+ if (ctx->err_stack || ctx->too_many) {
+ return NULL;
+ }
+
+ ret = BN_POOL_get(&ctx->pool);
+ if (ret == NULL) {
+ /* Setting too_many prevents repeated "get" attempts from
+ * cluttering the error stack. */
+ ctx->too_many = 1;
+ OPENSSL_PUT_ERROR(BN, BN_CTX_get, BN_R_TOO_MANY_TEMPORARY_VARIABLES);
+ return NULL;
+ }
+
+ /* OK, make sure the returned bignum is "zero" */
+ BN_zero(ret);
+ ctx->used++;
+ return ret;
+}
+
+void BN_CTX_end(BN_CTX *ctx) {
+ if (ctx->err_stack) {
+ ctx->err_stack--;
+ } else {
+ unsigned int fp = BN_STACK_pop(&ctx->stack);
+ /* Does this stack frame have anything to release? */
+ if (fp < ctx->used) {
+ BN_POOL_release(&ctx->pool, ctx->used - fp);
+ }
+
+ ctx->used = fp;
+ /* Unjam "too_many" in case "get" had failed */
+ ctx->too_many = 0;
+ }
+}
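+
+/* The canonical calling pattern for the frame functions above: starts and
+ * ends must be strictly nested, and no BN_CTX_get result may be used after
+ * the matching BN_CTX_end (a sketch, error handling abbreviated):
+ *
+ *   BN_CTX_start(ctx);
+ *   BIGNUM *a = BN_CTX_get(ctx);
+ *   BIGNUM *b = BN_CTX_get(ctx);
+ *   if (b == NULL) {  // checking the last get covers earlier ones too
+ *     goto err;
+ *   }
+ *   // ... use a and b ...
+ *  err:
+ *   BN_CTX_end(ctx);
+ */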
+
+/************/
+/* BN_STACK */
+/************/
+
+static void BN_STACK_init(BN_STACK *st) {
+ st->indexes = NULL;
+ st->depth = st->size = 0;
+}
+
+static void BN_STACK_finish(BN_STACK *st) {
+  if (st->size) {
+    OPENSSL_free(st->indexes);
+  }
+}
+
+static int BN_STACK_push(BN_STACK *st, unsigned int idx) {
+  if (st->depth == st->size) {
+    /* Need to expand */
+ unsigned int newsize =
+ (st->size ? (st->size * 3 / 2) : BN_CTX_START_FRAMES);
+ unsigned int *newitems = OPENSSL_malloc(newsize * sizeof(unsigned int));
+ if (!newitems) {
+ return 0;
+ }
+ if (st->depth) {
+ memcpy(newitems, st->indexes, st->depth * sizeof(unsigned int));
+ }
+ if (st->size) {
+ OPENSSL_free(st->indexes);
+ }
+ st->indexes = newitems;
+ st->size = newsize;
+ }
+
+ st->indexes[(st->depth)++] = idx;
+ return 1;
+}
+
+static unsigned int BN_STACK_pop(BN_STACK *st) {
+ return st->indexes[--(st->depth)];
+}
+
+static void BN_POOL_init(BN_POOL *p) {
+ p->head = p->current = p->tail = NULL;
+ p->used = p->size = 0;
+}
+
+static void BN_POOL_finish(BN_POOL *p) {
+ while (p->head) {
+ unsigned int loop = 0;
+ BIGNUM *bn = p->head->vals;
+ while (loop++ < BN_CTX_POOL_SIZE) {
+ if (bn->d) {
+ BN_clear_free(bn);
+ }
+ bn++;
+ }
+
+ p->current = p->head->next;
+ OPENSSL_free(p->head);
+ p->head = p->current;
+ }
+}
+
+static BIGNUM *BN_POOL_get(BN_POOL *p) {
+ if (p->used == p->size) {
+ BIGNUM *bn;
+ unsigned int loop = 0;
+ BN_POOL_ITEM *item = OPENSSL_malloc(sizeof(BN_POOL_ITEM));
+ if (!item) {
+ return NULL;
+ }
+
+ /* Initialise the structure */
+ bn = item->vals;
+ while (loop++ < BN_CTX_POOL_SIZE) {
+ BN_init(bn++);
+ }
+
+ item->prev = p->tail;
+ item->next = NULL;
+ /* Link it in */
+ if (!p->head) {
+ p->head = p->current = p->tail = item;
+ } else {
+ p->tail->next = item;
+ p->tail = item;
+ p->current = item;
+ }
+
+ p->size += BN_CTX_POOL_SIZE;
+ p->used++;
+ /* Return the first bignum from the new pool */
+ return item->vals;
+ }
+
+ if (!p->used) {
+ p->current = p->head;
+ } else if ((p->used % BN_CTX_POOL_SIZE) == 0) {
+ p->current = p->current->next;
+ }
+
+ return p->current->vals + ((p->used++) % BN_CTX_POOL_SIZE);
+}
+
+static void BN_POOL_release(BN_POOL *p, unsigned int num) {
+ unsigned int offset = (p->used - 1) % BN_CTX_POOL_SIZE;
+ p->used -= num;
+
+ while (num--) {
+ if (!offset) {
+ offset = BN_CTX_POOL_SIZE - 1;
+ p->current = p->current->prev;
+ } else {
+ offset--;
+ }
+ }
+}
diff --git a/src/crypto/bn/div.c b/src/crypto/bn/div.c
new file mode 100644
index 0000000..d65957a
--- /dev/null
+++ b/src/crypto/bn/div.c
@@ -0,0 +1,620 @@
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to. The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code. The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * "This product includes cryptographic software written by
+ * Eric Young (eay@cryptsoft.com)"
+ * The word 'cryptographic' can be left out if the rouines from the library
+ * being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ * the apps directory (application code) you must include an acknowledgement:
+ * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed. i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.] */
+
+#include <openssl/bn.h>
+
+#include <limits.h>
+#include <openssl/err.h>
+
+#include "internal.h"
+
+
+#define asm __asm__
+
+#if !defined(OPENSSL_NO_ASM)
+# if defined(__GNUC__) && __GNUC__>=2
+# if defined(OPENSSL_X86)
+ /*
+ * There were two reasons for implementing this template:
+ * - GNU C generates a call to a function (__udivdi3 to be exact)
+ * in reply to ((((BN_ULLONG)n0)<<BN_BITS2)|n1)/d0 (I fail to
+ * understand why...);
+ * - divl doesn't only calculate quotient, but also leaves
+ * remainder in %edx which we can definitely use here:-)
+ *
+ * <appro@fy.chalmers.se>
+ */
+#undef div_asm
+# define div_asm(n0,n1,d0) \
+ ({ asm volatile ( \
+ "divl %4" \
+ : "=a"(q), "=d"(rem) \
+ : "a"(n1), "d"(n0), "g"(d0) \
+ : "cc"); \
+ q; \
+ })
+# define REMAINDER_IS_ALREADY_CALCULATED
+# elif defined(OPENSSL_X86_64)
+ /*
+ * Same story here, but it's 128-bit by 64-bit division. Wow!
+ * <appro@fy.chalmers.se>
+ */
+# undef div_asm
+# define div_asm(n0,n1,d0) \
+ ({ asm volatile ( \
+ "divq %4" \
+ : "=a"(q), "=d"(rem) \
+ : "a"(n1), "d"(n0), "g"(d0) \
+ : "cc"); \
+ q; \
+ })
+# define REMAINDER_IS_ALREADY_CALCULATED
+# endif /* __<cpu> */
+# endif /* __GNUC__ */
+#endif /* OPENSSL_NO_ASM */
+
+/* BN_div computes dv := num / divisor, rounding towards
+ * zero, and sets up rm such that dv*divisor + rm = num holds.
+ * Thus:
+ * dv->neg == num->neg ^ divisor->neg (unless the result is zero)
+ * rm->neg == num->neg (unless the remainder is zero)
+ * If 'dv' or 'rm' is NULL, the respective value is not returned. */
+int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
+ BN_CTX *ctx) {
+ int norm_shift, i, loop;
+ BIGNUM *tmp, wnum, *snum, *sdiv, *res;
+ BN_ULONG *resp, *wnump;
+ BN_ULONG d0, d1;
+ int num_n, div_n;
+ int no_branch = 0;
+
+ /* Invalid zero-padding would have particularly bad consequences
+ * so don't just rely on bn_check_top() here */
+ if ((num->top > 0 && num->d[num->top - 1] == 0) ||
+ (divisor->top > 0 && divisor->d[divisor->top - 1] == 0)) {
+ OPENSSL_PUT_ERROR(BN, BN_div, BN_R_NOT_INITIALIZED);
+ return 0;
+ }
+
+ if ((num->flags & BN_FLG_CONSTTIME) != 0 ||
+ (divisor->flags & BN_FLG_CONSTTIME) != 0) {
+ no_branch = 1;
+ }
+
+ if (BN_is_zero(divisor)) {
+ OPENSSL_PUT_ERROR(BN, BN_div, BN_R_DIV_BY_ZERO);
+ return 0;
+ }
+
+ if (!no_branch && BN_ucmp(num, divisor) < 0) {
+ if (rm != NULL) {
+ if (BN_copy(rm, num) == NULL) {
+ return 0;
+ }
+ }
+ if (dv != NULL) {
+ BN_zero(dv);
+ }
+ return 1;
+ }
+
+ BN_CTX_start(ctx);
+ tmp = BN_CTX_get(ctx);
+ snum = BN_CTX_get(ctx);
+ sdiv = BN_CTX_get(ctx);
+ if (dv == NULL) {
+ res = BN_CTX_get(ctx);
+ } else {
+ res = dv;
+ }
+ if (sdiv == NULL || res == NULL || tmp == NULL || snum == NULL) {
+ goto err;
+ }
+
+ /* First we normalise the numbers */
+ norm_shift = BN_BITS2 - ((BN_num_bits(divisor)) % BN_BITS2);
+ if (!(BN_lshift(sdiv, divisor, norm_shift))) {
+ goto err;
+ }
+ sdiv->neg = 0;
+ norm_shift += BN_BITS2;
+ if (!(BN_lshift(snum, num, norm_shift))) {
+ goto err;
+ }
+ snum->neg = 0;
+
+ if (no_branch) {
+ /* Since we don't know whether snum is larger than sdiv,
+ * we pad snum with enough zeroes without changing its
+ * value.
+ */
+ if (snum->top <= sdiv->top + 1) {
+ if (bn_wexpand(snum, sdiv->top + 2) == NULL) {
+ goto err;
+ }
+ for (i = snum->top; i < sdiv->top + 2; i++) {
+ snum->d[i] = 0;
+ }
+ snum->top = sdiv->top + 2;
+ } else {
+ if (bn_wexpand(snum, snum->top + 1) == NULL) {
+ goto err;
+ }
+ snum->d[snum->top] = 0;
+ snum->top++;
+ }
+ }
+
+ div_n = sdiv->top;
+ num_n = snum->top;
+ loop = num_n - div_n;
+  /* Let's set up a 'window' into snum.
+   * This is the part that corresponds to the current
+   * 'area' being divided */
+ wnum.neg = 0;
+ wnum.d = &(snum->d[loop]);
+ wnum.top = div_n;
+ /* only needed when BN_ucmp messes up the values between top and max */
+ wnum.dmax = snum->dmax - loop; /* so we don't step out of bounds */
+
+ /* Get the top 2 words of sdiv */
+ /* div_n=sdiv->top; */
+ d0 = sdiv->d[div_n - 1];
+ d1 = (div_n == 1) ? 0 : sdiv->d[div_n - 2];
+
+ /* pointer to the 'top' of snum */
+ wnump = &(snum->d[num_n - 1]);
+
+  /* Set up 'res' */
+ res->neg = (num->neg ^ divisor->neg);
+ if (!bn_wexpand(res, (loop + 1))) {
+ goto err;
+ }
+ res->top = loop - no_branch;
+ resp = &(res->d[loop - 1]);
+
+ /* space for temp */
+ if (!bn_wexpand(tmp, (div_n + 1))) {
+ goto err;
+ }
+
+ if (!no_branch) {
+ if (BN_ucmp(&wnum, sdiv) >= 0) {
+ bn_sub_words(wnum.d, wnum.d, sdiv->d, div_n);
+ *resp = 1;
+ } else {
+ res->top--;
+ }
+ }
+
+ /* if res->top == 0 then clear the neg value otherwise decrease
+ * the resp pointer */
+ if (res->top == 0) {
+ res->neg = 0;
+ } else {
+ resp--;
+ }
+
+ for (i = 0; i < loop - 1; i++, wnump--, resp--) {
+ BN_ULONG q, l0;
+ /* the first part of the loop uses the top two words of snum and sdiv to
+ * calculate a BN_ULONG q such that | wnum - sdiv * q | < sdiv */
+ BN_ULONG n0, n1, rem = 0;
+
+ n0 = wnump[0];
+ n1 = wnump[-1];
+ if (n0 == d0) {
+ q = BN_MASK2;
+ } else {
+ /* n0 < d0 */
+#ifdef BN_LLONG
+ BN_ULLONG t2;
+
+#if defined(BN_LLONG) && !defined(div_asm)
+ q = (BN_ULONG)(((((BN_ULLONG)n0) << BN_BITS2) | n1) / d0);
+#else
+ q = div_asm(n0, n1, d0);
+#endif
+
+#ifndef REMAINDER_IS_ALREADY_CALCULATED
+      /* rem doesn't have to be BN_ULLONG: at this point we know it is less
+       * than d0. */
+ rem = (n1 - q * d0) & BN_MASK2;
+#endif
+
+ t2 = (BN_ULLONG)d1 * q;
+
+ for (;;) {
+ if (t2 <= ((((BN_ULLONG)rem) << BN_BITS2) | wnump[-2]))
+ break;
+ q--;
+ rem += d0;
+ if (rem < d0)
+ break; /* don't let rem overflow */
+ t2 -= d1;
+ }
+#else /* !BN_LLONG */
+ BN_ULONG t2l, t2h;
+
+#if defined(div_asm)
+ q = div_asm(n0, n1, d0);
+#else
+ q = bn_div_words(n0, n1, d0);
+#endif
+
+#ifndef REMAINDER_IS_ALREADY_CALCULATED
+ rem = (n1 - q * d0) & BN_MASK2;
+#endif
+
+#if defined(BN_UMULT_LOHI)
+ BN_UMULT_LOHI(t2l, t2h, d1, q);
+#elif defined(BN_UMULT_HIGH)
+ t2l = d1 * q;
+ t2h = BN_UMULT_HIGH(d1, q);
+#else
+ {
+ BN_ULONG ql, qh;
+ t2l = LBITS(d1);
+ t2h = HBITS(d1);
+ ql = LBITS(q);
+ qh = HBITS(q);
+ mul64(t2l, t2h, ql, qh); /* t2=(BN_ULLONG)d1*q; */
+ }
+#endif
+
+ for (;;) {
+ if ((t2h < rem) || ((t2h == rem) && (t2l <= wnump[-2])))
+ break;
+ q--;
+ rem += d0;
+ if (rem < d0)
+ break; /* don't let rem overflow */
+ if (t2l < d1)
+ t2h--;
+ t2l -= d1;
+ }
+#endif /* !BN_LLONG */
+ }
+
+ l0 = bn_mul_words(tmp->d, sdiv->d, div_n, q);
+ tmp->d[div_n] = l0;
+ wnum.d--;
+    /* ignore the top values of the bignums; just subtract the two
+     * BN_ULONG arrays with bn_sub_words */
+ if (bn_sub_words(wnum.d, wnum.d, tmp->d, div_n + 1)) {
+ /* Note: As we have considered only the leading
+ * two BN_ULONGs in the calculation of q, sdiv * q
+ * might be greater than wnum (but then (q-1) * sdiv
+ * is less or equal than wnum)
+ */
+ q--;
+ if (bn_add_words(wnum.d, wnum.d, sdiv->d, div_n)) {
+ /* we can't have an overflow here (assuming
+ * that q != 0, but if q == 0 then tmp is
+ * zero anyway) */
+ (*wnump)++;
+ }
+ }
+ /* store part of the result */
+ *resp = q;
+ }
+ bn_correct_top(snum);
+ if (rm != NULL) {
+ /* Keep a copy of the neg flag in num because if rm==num
+ * BN_rshift() will overwrite it.
+ */
+ int neg = num->neg;
+ BN_rshift(rm, snum, norm_shift);
+ if (!BN_is_zero(rm)) {
+ rm->neg = neg;
+ }
+ }
+ if (no_branch) {
+ bn_correct_top(res);
+ }
+ BN_CTX_end(ctx);
+ return 1;
+
+err:
+ BN_CTX_end(ctx);
+ return 0;
+}
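+
+/* A worked instance of the contract above: for num = -7 and divisor = 2,
+ * rounding towards zero gives dv = -3 and rm = -1, and indeed
+ * (-3) * 2 + (-1) == -7, with rm->neg == num->neg. */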
+
+int BN_nnmod(BIGNUM *r, const BIGNUM *m, const BIGNUM *d, BN_CTX *ctx) {
+ if (!(BN_mod(r, m, d, ctx))) {
+ return 0;
+ }
+ if (!r->neg) {
+ return 1;
+ }
+
+ /* now -|d| < r < 0, so we have to set r := r + |d|. */
+ return (d->neg ? BN_sub : BN_add)(r, r, d);
+}
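+
+/* A worked instance: BN_mod(r, -7, 3) leaves r == -1 (truncated division),
+ * which BN_nnmod corrects to -1 + 3 == 2, the unique representative in
+ * [0, |d|). */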
+
+int BN_mod_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m,
+ BN_CTX *ctx) {
+ if (!BN_add(r, a, b)) {
+ return 0;
+ }
+ return BN_nnmod(r, r, m, ctx);
+}
+
+int BN_mod_add_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
+ const BIGNUM *m) {
+ if (!BN_uadd(r, a, b)) {
+ return 0;
+ }
+ if (BN_ucmp(r, m) >= 0) {
+ return BN_usub(r, r, m);
+ }
+ return 1;
+}
+
+int BN_mod_sub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m,
+ BN_CTX *ctx) {
+ if (!BN_sub(r, a, b)) {
+ return 0;
+ }
+ return BN_nnmod(r, r, m, ctx);
+}
+
+/* BN_mod_sub variant that may be used if both a and b are non-negative
+ * and less than m */
+int BN_mod_sub_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
+ const BIGNUM *m) {
+ if (!BN_sub(r, a, b)) {
+ return 0;
+ }
+ if (r->neg) {
+ return BN_add(r, r, m);
+ }
+ return 1;
+}
+
+int BN_mod_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m,
+ BN_CTX *ctx) {
+ BIGNUM *t;
+ int ret = 0;
+
+ BN_CTX_start(ctx);
+ t = BN_CTX_get(ctx);
+ if (t == NULL) {
+ goto err;
+ }
+
+ if (a == b) {
+ if (!BN_sqr(t, a, ctx)) {
+ goto err;
+ }
+ } else {
+ if (!BN_mul(t, a, b, ctx)) {
+ goto err;
+ }
+ }
+
+ if (!BN_nnmod(r, t, m, ctx)) {
+ goto err;
+ }
+
+ ret = 1;
+
+err:
+ BN_CTX_end(ctx);
+ return ret;
+}
+
+int BN_mod_sqr(BIGNUM *r, const BIGNUM *a, const BIGNUM *m, BN_CTX *ctx) {
+ if (!BN_sqr(r, a, ctx)) {
+ return 0;
+ }
+
+ /* r->neg == 0, thus we don't need BN_nnmod */
+ return BN_mod(r, r, m, ctx);
+}
+
+int BN_mod_lshift(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m,
+ BN_CTX *ctx) {
+ BIGNUM *abs_m = NULL;
+ int ret;
+
+ if (!BN_nnmod(r, a, m, ctx)) {
+ return 0;
+ }
+
+ if (m->neg) {
+ abs_m = BN_dup(m);
+ if (abs_m == NULL) {
+ return 0;
+ }
+ abs_m->neg = 0;
+ }
+
+ ret = BN_mod_lshift_quick(r, r, n, (abs_m ? abs_m : m));
+
+ if (abs_m) {
+ BN_free(abs_m);
+ }
+ return ret;
+}
+
+int BN_mod_lshift_quick(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m) {
+ if (r != a) {
+ if (BN_copy(r, a) == NULL) {
+ return 0;
+ }
+ }
+
+ while (n > 0) {
+ int max_shift;
+
+ /* 0 < r < m */
+ max_shift = BN_num_bits(m) - BN_num_bits(r);
+    /* max_shift >= 0 if the input was reduced, as required */
+
+ if (max_shift < 0) {
+ OPENSSL_PUT_ERROR(BN, BN_mod_lshift_quick, BN_R_INPUT_NOT_REDUCED);
+ return 0;
+ }
+
+ if (max_shift > n) {
+ max_shift = n;
+ }
+
+ if (max_shift) {
+ if (!BN_lshift(r, r, max_shift)) {
+ return 0;
+ }
+ n -= max_shift;
+ } else {
+ if (!BN_lshift1(r, r)) {
+ return 0;
+ }
+ --n;
+ }
+
+ /* BN_num_bits(r) <= BN_num_bits(m) */
+ if (BN_cmp(r, m) >= 0) {
+ if (!BN_sub(r, r, m)) {
+ return 0;
+ }
+ }
+ }
+
+ return 1;
+}
+
+int BN_mod_lshift1(BIGNUM *r, const BIGNUM *a, const BIGNUM *m, BN_CTX *ctx) {
+ if (!BN_lshift1(r, a)) {
+ return 0;
+ }
+
+ return BN_nnmod(r, r, m, ctx);
+}
+
+int BN_mod_lshift1_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *m) {
+ if (!BN_lshift1(r, a)) {
+ return 0;
+ }
+ if (BN_cmp(r, m) >= 0) {
+ return BN_sub(r, r, m);
+ }
+
+ return 1;
+}
+
+BN_ULONG BN_div_word(BIGNUM *a, BN_ULONG w) {
+ BN_ULONG ret = 0;
+ int i, j;
+
+ w &= BN_MASK2;
+
+ if (!w) {
+    /* actually this is an error (division by zero) */
+    return (BN_ULONG)-1;
+ }
+
+ if (a->top == 0) {
+ return 0;
+ }
+
+ /* normalize input (so bn_div_words doesn't complain) */
+ j = BN_BITS2 - BN_num_bits_word(w);
+ w <<= j;
+ if (!BN_lshift(a, a, j)) {
+    return (BN_ULONG)-1;
+ }
+
+ for (i = a->top - 1; i >= 0; i--) {
+ BN_ULONG l, d;
+
+ l = a->d[i];
+ d = bn_div_words(ret, l, w);
+ ret = (l - ((d * w) & BN_MASK2)) & BN_MASK2;
+ a->d[i] = d;
+ }
+
+ if ((a->top > 0) && (a->d[a->top - 1] == 0)) {
+ a->top--;
+ }
+
+ ret >>= j;
+ return ret;
+}
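+
+/* A usage sketch for the function above: dividing a BIGNUM by 10 in place
+ * and retrieving the remainder, much as BN_bn2dec does with BN_DEC_CONV:
+ *
+ *   BN_ULONG rem = BN_div_word(bn, 10);  // bn /= 10; rem = old bn % 10
+ *   if (rem == (BN_ULONG)-1) {
+ *     // error: division by zero or allocation failure
+ *   }
+ */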
+
+BN_ULONG BN_mod_word(const BIGNUM *a, BN_ULONG w) {
+#ifndef BN_LLONG
+ BN_ULONG ret = 0;
+#else
+ BN_ULLONG ret = 0;
+#endif
+ int i;
+
+ if (w == 0) {
+    return (BN_ULONG)-1;
+ }
+
+ w &= BN_MASK2;
+ for (i = a->top - 1; i >= 0; i--) {
+#ifndef BN_LLONG
+ ret = ((ret << BN_BITS4) | ((a->d[i] >> BN_BITS4) & BN_MASK2l)) % w;
+ ret = ((ret << BN_BITS4) | (a->d[i] & BN_MASK2l)) % w;
+#else
+ ret = (BN_ULLONG)(((ret << (BN_ULLONG)BN_BITS2) | a->d[i]) % (BN_ULLONG)w);
+#endif
+ }
+ return (BN_ULONG)ret;
+}
diff --git a/src/crypto/bn/exponentiation.c b/src/crypto/bn/exponentiation.c
new file mode 100644
index 0000000..53f3e9c
--- /dev/null
+++ b/src/crypto/bn/exponentiation.c
@@ -0,0 +1,1535 @@
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to. The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code. The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * "This product includes cryptographic software written by
+ * Eric Young (eay@cryptsoft.com)"
+ * The word 'cryptographic' can be left out if the rouines from the library
+ * being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ * the apps directory (application code) you must include an acknowledgement:
+ * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed. i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+/* ====================================================================
+ * Copyright (c) 1998-2005 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com). */
+
+#include <openssl/bn.h>
+
+#include <assert.h>
+#include <string.h>
+
+#include <openssl/cpu.h>
+#include <openssl/err.h>
+#include <openssl/mem.h>
+
+#include "internal.h"
+
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64)
+#define OPENSSL_BN_ASM_MONT5
+#define RSAZ_ENABLED
+
+#include "rsaz_exp.h"
+#endif
+
+int BN_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx) {
+ int i, bits, ret = 0;
+ BIGNUM *v, *rr;
+
+ if ((p->flags & BN_FLG_CONSTTIME) != 0) {
+ /* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
+ OPENSSL_PUT_ERROR(BN, BN_exp, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
+ return 0;
+ }
+
+ BN_CTX_start(ctx);
+ if (r == a || r == p) {
+ rr = BN_CTX_get(ctx);
+ } else {
+ rr = r;
+ }
+
+ v = BN_CTX_get(ctx);
+ if (rr == NULL || v == NULL) {
+ goto err;
+ }
+
+ if (BN_copy(v, a) == NULL) {
+ goto err;
+ }
+ bits = BN_num_bits(p);
+
+ if (BN_is_odd(p)) {
+ if (BN_copy(rr, a) == NULL) {
+ goto err;
+ }
+ } else {
+ if (!BN_one(rr)) {
+ goto err;
+ }
+ }
+
+ for (i = 1; i < bits; i++) {
+ if (!BN_sqr(v, v, ctx)) {
+ goto err;
+ }
+ if (BN_is_bit_set(p, i)) {
+ if (!BN_mul(rr, rr, v, ctx)) {
+ goto err;
+ }
+ }
+ }
+ ret = 1;
+
+err:
+ if (r != rr) {
+ BN_copy(r, rr);
+ }
+ BN_CTX_end(ctx);
+ return ret;
+}
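+
+/* A worked instance of the square-and-multiply loop above: for p = 5
+ * (binary 101), rr is seeded with a (p is odd), v becomes a^2 and then a^4,
+ * and the set bit at position 2 multiplies rr by a^4, giving rr = a^5. */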
+
+/* maximum precomputation table size for *variable* sliding windows */
+#define TABLE_SIZE 32
+
+typedef struct bn_recp_ctx_st {
+ BIGNUM N; /* the divisor */
+ BIGNUM Nr; /* the reciprocal */
+ int num_bits;
+ int shift;
+ int flags;
+} BN_RECP_CTX;
+
+static void BN_RECP_CTX_init(BN_RECP_CTX *recp) {
+ BN_init(&recp->N);
+ BN_init(&recp->Nr);
+ recp->num_bits = 0;
+ recp->flags = 0;
+}
+
+static void BN_RECP_CTX_free(BN_RECP_CTX *recp) {
+ if (recp == NULL) {
+ return;
+ }
+
+ BN_free(&recp->N);
+ BN_free(&recp->Nr);
+}
+
+static int BN_RECP_CTX_set(BN_RECP_CTX *recp, const BIGNUM *d, BN_CTX *ctx) {
+ if (!BN_copy(&(recp->N), d)) {
+ return 0;
+ }
+ BN_zero(&recp->Nr);
+ recp->num_bits = BN_num_bits(d);
+ recp->shift = 0;
+
+ return 1;
+}
+
+/* len is the expected size of the result. We actually calculate with an extra
+ * word of precision, so we can do faster division if the remainder is not
+ * required.
+ * r := 2^len / m */
+static int BN_reciprocal(BIGNUM *r, const BIGNUM *m, int len, BN_CTX *ctx) {
+ int ret = -1;
+ BIGNUM *t;
+
+ BN_CTX_start(ctx);
+ t = BN_CTX_get(ctx);
+ if (t == NULL) {
+ goto err;
+ }
+
+ if (!BN_set_bit(t, len)) {
+ goto err;
+ }
+
+ if (!BN_div(r, NULL, t, m, ctx)) {
+ goto err;
+ }
+
+ ret = len;
+
+err:
+ BN_CTX_end(ctx);
+ return ret;
+}
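+
+/* A worked instance: for m = 10 and len = 8, r = floor(2^8 / 10) = 25.
+ * Multiplying by 25 and shifting right by 8 then approximates division by
+ * 10 closely enough that the correction loop in BN_div_recp below runs at
+ * most a couple of times. */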
+
+static int BN_div_recp(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m,
+ BN_RECP_CTX *recp, BN_CTX *ctx) {
+ int i, j, ret = 0;
+ BIGNUM *a, *b, *d, *r;
+
+ BN_CTX_start(ctx);
+ a = BN_CTX_get(ctx);
+ b = BN_CTX_get(ctx);
+ if (dv != NULL) {
+ d = dv;
+ } else {
+ d = BN_CTX_get(ctx);
+ }
+
+ if (rem != NULL) {
+ r = rem;
+ } else {
+ r = BN_CTX_get(ctx);
+ }
+
+ if (a == NULL || b == NULL || d == NULL || r == NULL) {
+ goto err;
+ }
+
+ if (BN_ucmp(m, &(recp->N)) < 0) {
+ BN_zero(d);
+ if (!BN_copy(r, m)) {
+ return 0;
+ }
+ BN_CTX_end(ctx);
+ return 1;
+ }
+
+  /* We want the remainder.
+   * Given input of ABCDEF / ab
+   * we need to multiply ABCDEF by 3 digits of the reciprocal of ab */
+
+ /* i := max(BN_num_bits(m), 2*BN_num_bits(N)) */
+ i = BN_num_bits(m);
+ j = recp->num_bits << 1;
+ if (j > i) {
+ i = j;
+ }
+
+ /* Nr := round(2^i / N) */
+ if (i != recp->shift) {
+ recp->shift =
+ BN_reciprocal(&(recp->Nr), &(recp->N), i,
+ ctx); /* BN_reciprocal returns i, or -1 for an error */
+ }
+
+ if (recp->shift == -1) {
+ goto err;
+ }
+
+ /* d := |round(round(m / 2^BN_num_bits(N)) * recp->Nr / 2^(i -
+ * BN_num_bits(N)))|
+ * = |round(round(m / 2^BN_num_bits(N)) * round(2^i / N) / 2^(i -
+ * BN_num_bits(N)))|
+ * <= |(m / 2^BN_num_bits(N)) * (2^i / N) * (2^BN_num_bits(N) / 2^i)|
+ * = |m/N| */
+ if (!BN_rshift(a, m, recp->num_bits)) {
+ goto err;
+ }
+ if (!BN_mul(b, a, &(recp->Nr), ctx)) {
+ goto err;
+ }
+ if (!BN_rshift(d, b, i - recp->num_bits)) {
+ goto err;
+ }
+ d->neg = 0;
+
+ if (!BN_mul(b, &(recp->N), d, ctx)) {
+ goto err;
+ }
+ if (!BN_usub(r, m, b)) {
+ goto err;
+ }
+ r->neg = 0;
+
+ j = 0;
+ while (BN_ucmp(r, &(recp->N)) >= 0) {
+ if (j++ > 2) {
+ OPENSSL_PUT_ERROR(BN, BN_div_recp, BN_R_BAD_RECIPROCAL);
+ goto err;
+ }
+ if (!BN_usub(r, r, &(recp->N))) {
+ goto err;
+ }
+ if (!BN_add_word(d, 1)) {
+ goto err;
+ }
+ }
+
+ r->neg = BN_is_zero(r) ? 0 : m->neg;
+ d->neg = m->neg ^ recp->N.neg;
+ ret = 1;
+
+err:
+ BN_CTX_end(ctx);
+ return ret;
+}
+
+static int BN_mod_mul_reciprocal(BIGNUM *r, const BIGNUM *x, const BIGNUM *y,
+ BN_RECP_CTX *recp, BN_CTX *ctx) {
+ int ret = 0;
+ BIGNUM *a;
+ const BIGNUM *ca;
+
+ BN_CTX_start(ctx);
+ a = BN_CTX_get(ctx);
+ if (a == NULL) {
+ goto err;
+ }
+
+ if (y != NULL) {
+ if (x == y) {
+ if (!BN_sqr(a, x, ctx)) {
+ goto err;
+ }
+ } else {
+ if (!BN_mul(a, x, y, ctx)) {
+ goto err;
+ }
+ }
+ ca = a;
+ } else {
+ ca = x; /* Just do the mod */
+ }
+
+ ret = BN_div_recp(NULL, r, ca, recp, ctx);
+
+err:
+ BN_CTX_end(ctx);
+ return ret;
+}
+
+/* BN_window_bits_for_exponent_size -- macro for sliding window mod_exp
+ * functions
+ *
+ * For window size 'w' (w >= 2) and a random 'b' bits exponent, the number of
+ * multiplications is a constant plus on average
+ *
+ * 2^(w-1) + (b-w)/(w+1);
+ *
+ * here 2^(w-1) is for precomputing the table (we actually need entries only
+ * for windows that have the lowest bit set), and (b-w)/(w+1) is an
+ * approximation for the expected number of w-bit windows, not counting the
+ * first one.
+ *
+ * Thus we should use
+ *
+ * w >= 6 if b > 671
+ * w = 5 if 671 > b > 239
+ * w = 4 if 239 > b > 79
+ * w = 3 if 79 > b > 23
+ * w <= 2 if 23 > b
+ *
+ * (with draws in between). Very small exponents are often selected
+ * with low Hamming weight, so we use w = 1 for b <= 23. */
+#define BN_window_bits_for_exponent_size(b) \
+ ((b) > 671 ? 6 : \
+ (b) > 239 ? 5 : \
+ (b) > 79 ? 4 : \
+ (b) > 23 ? 3 : 1)
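+
+/* A worked instance of the cost formula above: for b = 1024, w = 6 costs
+ * about 2^5 + (1024 - 6)/7 = 32 + 145 = 177 multiplications, versus
+ * 2^4 + (1024 - 5)/6 = 16 + 170 = 186 for w = 5; equating the two costs
+ * gives the b = 671 crossover used in the macro. */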
+
+static int mod_exp_recp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
+ const BIGNUM *m, BN_CTX *ctx) {
+ int i, j, bits, ret = 0, wstart, window;
+ int start = 1;
+ BIGNUM *aa;
+ /* Table of variables obtained from 'ctx' */
+ BIGNUM *val[TABLE_SIZE];
+ BN_RECP_CTX recp;
+
+ if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0) {
+ /* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
+ OPENSSL_PUT_ERROR(BN, mod_exp_recp, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
+ return 0;
+ }
+
+ bits = BN_num_bits(p);
+
+ if (bits == 0) {
+ ret = BN_one(r);
+ return ret;
+ }
+
+ BN_CTX_start(ctx);
+ aa = BN_CTX_get(ctx);
+ val[0] = BN_CTX_get(ctx);
+ if (!aa || !val[0]) {
+ goto err;
+ }
+
+ BN_RECP_CTX_init(&recp);
+ if (m->neg) {
+ /* ignore sign of 'm' */
+ if (!BN_copy(aa, m)) {
+ goto err;
+ }
+ aa->neg = 0;
+ if (BN_RECP_CTX_set(&recp, aa, ctx) <= 0) {
+ goto err;
+ }
+ } else {
+ if (BN_RECP_CTX_set(&recp, m, ctx) <= 0) {
+ goto err;
+ }
+ }
+
+ if (!BN_nnmod(val[0], a, m, ctx)) {
+ goto err; /* 1 */
+ }
+ if (BN_is_zero(val[0])) {
+ BN_zero(r);
+ ret = 1;
+ goto err;
+ }
+
+ window = BN_window_bits_for_exponent_size(bits);
+ if (window > 1) {
+ if (!BN_mod_mul_reciprocal(aa, val[0], val[0], &recp, ctx)) {
+ goto err; /* 2 */
+ }
+ j = 1 << (window - 1);
+ for (i = 1; i < j; i++) {
+ if (((val[i] = BN_CTX_get(ctx)) == NULL) ||
+ !BN_mod_mul_reciprocal(val[i], val[i - 1], aa, &recp, ctx)) {
+ goto err;
+ }
+ }
+ }
+
+ start = 1; /* This is used to avoid multiplication etc
+ * when there is only the value '1' in the
+ * buffer. */
+ wstart = bits - 1; /* The top bit of the window */
+
+ if (!BN_one(r)) {
+ goto err;
+ }
+
+ for (;;) {
+ int wvalue; /* The 'value' of the window */
+ int wend; /* The bottom bit of the window */
+
+ if (BN_is_bit_set(p, wstart) == 0) {
+ if (!start) {
+ if (!BN_mod_mul_reciprocal(r, r, r, &recp, ctx)) {
+ goto err;
+ }
+ }
+ if (wstart == 0) {
+ break;
+ }
+ wstart--;
+ continue;
+ }
+
+    /* We now have wstart on a 'set' bit, so we now need to work out
+     * how big a window to do. To do this we need to scan
+ * forward until the last set bit before the end of the
+ * window */
+ wvalue = 1;
+ wend = 0;
+ for (i = 1; i < window; i++) {
+ if (wstart - i < 0) {
+ break;
+ }
+ if (BN_is_bit_set(p, wstart - i)) {
+ wvalue <<= (i - wend);
+ wvalue |= 1;
+ wend = i;
+ }
+ }
+
+ /* wend is the size of the current window */
+ j = wend + 1;
+ /* add the 'bytes above' */
+ if (!start) {
+ for (i = 0; i < j; i++) {
+ if (!BN_mod_mul_reciprocal(r, r, r, &recp, ctx)) {
+ goto err;
+ }
+ }
+ }
+
+ /* wvalue will be an odd number < 2^window */
+ if (!BN_mod_mul_reciprocal(r, r, val[wvalue >> 1], &recp, ctx)) {
+ goto err;
+ }
+
+ /* move the 'window' down further */
+ wstart -= wend + 1;
+ start = 0;
+ if (wstart < 0) {
+ break;
+ }
+ }
+ ret = 1;
+
+err:
+ BN_CTX_end(ctx);
+ BN_RECP_CTX_free(&recp);
+ return ret;
+}
+
+int BN_mod_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, const BIGNUM *m,
+ BN_CTX *ctx) {
+ /* For even modulus m = 2^k*m_odd, it might make sense to compute
+ * a^p mod m_odd and a^p mod 2^k separately (with Montgomery
+ * exponentiation for the odd part), using appropriate exponent
+ * reductions, and combine the results using the CRT.
+ *
+ * For now, we use Montgomery only if the modulus is odd; otherwise,
+ * exponentiation using the reciprocal-based quick remaindering
+ * algorithm is used.
+ *
+ * (Timing obtained with expspeed.c [computations a^p mod m
+ * where a, p, m are of the same length: 256, 512, 1024, 2048,
+ * 4096, 8192 bits], compared to the running time of the
+ * standard algorithm:
+ *
+ * BN_mod_exp_mont 33 .. 40 % [AMD K6-2, Linux, debug configuration]
+ * 55 .. 77 % [UltraSparc processor, but
+ * debug-solaris-sparcv8-gcc conf.]
+ *
+ * BN_mod_exp_recp 50 .. 70 % [AMD K6-2, Linux, debug configuration]
+ * 62 .. 118 % [UltraSparc, debug-solaris-sparcv8-gcc]
+ *
+ * On the Sparc, BN_mod_exp_recp was faster than BN_mod_exp_mont
+ * at 2048 and more bits, but at 512 and 1024 bits, it was
+ * slower even than the standard algorithm!
+ *
+ * "Real" timings [linux-elf, solaris-sparcv9-gcc configurations]
+ * should be obtained when the new Montgomery reduction code
+ * has been integrated into OpenSSL.) */
+
+ if (BN_is_odd(m)) {
+ if (a->top == 1 && !a->neg && BN_get_flags(p, BN_FLG_CONSTTIME) == 0) {
+ BN_ULONG A = a->d[0];
+ return BN_mod_exp_mont_word(r, A, p, m, ctx, NULL);
+ }
+
+ return BN_mod_exp_mont(r, a, p, m, ctx, NULL);
+ }
+
+ return mod_exp_recp(r, a, p, m, ctx);
+}
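+
+/* A typical call sketch for the dispatcher above (computes r = a^p mod m;
+ * error handling elided):
+ *
+ *   BN_CTX *ctx = BN_CTX_new();
+ *   BIGNUM *r = BN_new();
+ *   BN_mod_exp(r, a, p, m, ctx);  // Montgomery path when m is odd
+ *   BN_free(r);
+ *   BN_CTX_free(ctx);
+ */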
+
+int BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
+ const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont) {
+ int i, j, bits, ret = 0, wstart, window;
+ int start = 1;
+ BIGNUM *d, *r;
+ const BIGNUM *aa;
+ /* Table of variables obtained from 'ctx' */
+ BIGNUM *val[TABLE_SIZE];
+ BN_MONT_CTX *mont = NULL;
+
+ if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0) {
+ return BN_mod_exp_mont_consttime(rr, a, p, m, ctx, in_mont);
+ }
+
+ if (!BN_is_odd(m)) {
+ OPENSSL_PUT_ERROR(BN, BN_mod_exp_mont, BN_R_CALLED_WITH_EVEN_MODULUS);
+ return 0;
+ }
+ bits = BN_num_bits(p);
+ if (bits == 0) {
+ ret = BN_one(rr);
+ return ret;
+ }
+
+ BN_CTX_start(ctx);
+ d = BN_CTX_get(ctx);
+ r = BN_CTX_get(ctx);
+ val[0] = BN_CTX_get(ctx);
+ if (!d || !r || !val[0]) {
+ goto err;
+ }
+
+  /* Allocate a Montgomery context if one was not supplied by the caller;
+   * if this is not done, things will break in the Montgomery part. */
+
+ if (in_mont != NULL) {
+ mont = in_mont;
+ } else {
+ mont = BN_MONT_CTX_new();
+ if (mont == NULL) {
+ goto err;
+ }
+ if (!BN_MONT_CTX_set(mont, m, ctx)) {
+ goto err;
+ }
+ }
+
+ if (a->neg || BN_ucmp(a, m) >= 0) {
+ if (!BN_nnmod(val[0], a, m, ctx)) {
+ goto err;
+ }
+ aa = val[0];
+ } else {
+ aa = a;
+ }
+
+ if (BN_is_zero(aa)) {
+ BN_zero(rr);
+ ret = 1;
+ goto err;
+ }
+ if (!BN_to_montgomery(val[0], aa, mont, ctx)) {
+ goto err; /* 1 */
+ }
+
+ window = BN_window_bits_for_exponent_size(bits);
+ if (window > 1) {
+ if (!BN_mod_mul_montgomery(d, val[0], val[0], mont, ctx)) {
+ goto err; /* 2 */
+ }
+ j = 1 << (window - 1);
+ for (i = 1; i < j; i++) {
+ if (((val[i] = BN_CTX_get(ctx)) == NULL) ||
+ !BN_mod_mul_montgomery(val[i], val[i - 1], d, mont, ctx)) {
+ goto err;
+ }
+ }
+ }
+
+ start = 1; /* This is used to avoid multiplication etc
+ * when there is only the value '1' in the
+ * buffer. */
+ wstart = bits - 1; /* The top bit of the window */
+
+ j = m->top; /* borrow j */
+ if (m->d[j - 1] & (((BN_ULONG)1) << (BN_BITS2 - 1))) {
+ if (bn_wexpand(r, j) == NULL)
+ goto err;
+ /* 2^(top*BN_BITS2) - m */
+ r->d[0] = (0 - m->d[0]) & BN_MASK2;
+ for (i = 1; i < j; i++)
+ r->d[i] = (~m->d[i]) & BN_MASK2;
+ r->top = j;
+ /* Upper words will be zero if the corresponding words of 'm'
+ * were 0xfff[...], so decrement r->top accordingly. */
+ bn_correct_top(r);
+ } else if (!BN_to_montgomery(r, BN_value_one(), mont, ctx)) {
+ goto err;
+ }
+
+ for (;;) {
+ int wvalue; /* The 'value' of the window */
+ int wend; /* The bottom bit of the window */
+
+ if (BN_is_bit_set(p, wstart) == 0) {
+ if (!start) {
+ if (!BN_mod_mul_montgomery(r, r, r, mont, ctx))
+ goto err;
+ }
+ if (wstart == 0) {
+ break;
+ }
+ wstart--;
+ continue;
+ }
+
+    /* We now have wstart on a 'set' bit, so we now need to work out how big a
+     * window to do. To do this we need to scan forward until the last set bit
+ * before the end of the window */
+ wvalue = 1;
+ wend = 0;
+ for (i = 1; i < window; i++) {
+ if (wstart - i < 0) {
+ break;
+ }
+ if (BN_is_bit_set(p, wstart - i)) {
+ wvalue <<= (i - wend);
+ wvalue |= 1;
+ wend = i;
+ }
+ }
+
+ /* wend is the size of the current window */
+ j = wend + 1;
+ /* add the 'bytes above' */
+ if (!start) {
+ for (i = 0; i < j; i++) {
+ if (!BN_mod_mul_montgomery(r, r, r, mont, ctx)) {
+ goto err;
+ }
+ }
+ }
+
+ /* wvalue will be an odd number < 2^window */
+ if (!BN_mod_mul_montgomery(r, r, val[wvalue >> 1], mont, ctx)) {
+ goto err;
+ }
+
+ /* move the 'window' down further */
+ wstart -= wend + 1;
+ start = 0;
+ if (wstart < 0) {
+ break;
+ }
+ }
+
+ if (!BN_from_montgomery(rr, r, mont, ctx)) {
+ goto err;
+ }
+ ret = 1;
+
+err:
+ if (in_mont == NULL && mont != NULL) {
+ BN_MONT_CTX_free(mont);
+ }
+ BN_CTX_end(ctx);
+ return ret;
+}
+
+/* BN_mod_exp_mont_consttime() stores the precomputed powers in a specific
+ * layout so that accessing any of these table values shows the same access
+ * pattern as far as cache lines are concerned. The following functions are
+ * used to transfer a BIGNUM from/to that table. */
+static int copy_to_prebuf(const BIGNUM *b, int top, unsigned char *buf, int idx,
+ int width) {
+ size_t i, j;
+
+ if (top > b->top) {
+ top = b->top; /* this works because 'buf' is explicitly zeroed */
+ }
+ for (i = 0, j = idx; i < top * sizeof b->d[0]; i++, j += width) {
+ buf[j] = ((unsigned char *)b->d)[i];
+ }
+
+ return 1;
+}
+
+static int copy_from_prebuf(BIGNUM *b, int top, unsigned char *buf, int idx,
+ int width) {
+ size_t i, j;
+
+ if (bn_wexpand(b, top) == NULL) {
+ return 0;
+ }
+
+ for (i = 0, j = idx; i < top * sizeof b->d[0]; i++, j += width) {
+ ((unsigned char *)b->d)[i] = buf[j];
+ }
+
+ b->top = top;
+ bn_correct_top(b);
+ return 1;
+}
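+
+/* With this byte-interleaved layout, consecutive bytes of one table entry
+ * are |width| apart, so fetching any one entry touches every cache line of
+ * the table and the (secret) index |idx| cannot be inferred from which
+ * lines were loaded. */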
+
+/* BN_mod_exp_mont_consttime is based on the assumption that the L1 data cache
+ * line width of the target processor is at least the following value. */
+#define MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH (64)
+#define MOD_EXP_CTIME_MIN_CACHE_LINE_MASK \
+ (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - 1)
+
+/* Window sizes optimized for fixed window size modular exponentiation
+ * algorithm (BN_mod_exp_mont_consttime).
+ *
+ * To achieve the security goals of BN_mod_exp_mont_consttime, the maximum
+ * size of the window must not exceed
+ * log_2(MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH).
+ *
+ * Window size thresholds are defined for cache line sizes of 32 and 64 bytes
+ * (log_2(32) = 5 and log_2(64) = 6, respectively). A window size of
+ * 7 should only be used on processors that have a 128 byte or greater cache
+ * line size. */
+#if MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH == 64
+
+#define BN_window_bits_for_ctime_exponent_size(b) \
+ ((b) > 937 ? 6 : (b) > 306 ? 5 : (b) > 89 ? 4 : (b) > 22 ? 3 : 1)
+#define BN_MAX_WINDOW_BITS_FOR_CTIME_EXPONENT_SIZE (6)
+
+#elif MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH == 32
+
+#define BN_window_bits_for_ctime_exponent_size(b) \
+ ((b) > 306 ? 5 : (b) > 89 ? 4 : (b) > 22 ? 3 : 1)
+#define BN_MAX_WINDOW_BITS_FOR_CTIME_EXPONENT_SIZE (5)
+
+#endif
+
+/* Given a pointer value, compute the next address that is a cache line
+ * multiple. */
+#define MOD_EXP_CTIME_ALIGN(x_) \
+ ((unsigned char *)(x_) + \
+ (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - \
+ (((size_t)(x_)) & (MOD_EXP_CTIME_MIN_CACHE_LINE_MASK))))
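+
+/* A worked instance of the macro above, assuming a 64-byte line: for
+ * x_ == 0x1007, (x_ & 63) == 7, so the macro adds 64 - 7 == 57 and yields
+ * 0x1040, the next 64-byte boundary. An already-aligned pointer is advanced
+ * by a full line, which is why callers allocate
+ * MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH extra bytes. */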
+
+/* This variant of BN_mod_exp_mont() uses fixed windows and the special
+ * precomputation memory layout to limit data-dependency to a minimum
+ * to protect secret exponents (cf. the hyper-threading timing attacks
+ * pointed out by Colin Percival,
+ * http://www.daemonology.net/hyperthreading-considered-harmful/)
+ */
+int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
+ const BIGNUM *m, BN_CTX *ctx,
+ BN_MONT_CTX *in_mont) {
+ int i, bits, ret = 0, window, wvalue;
+ int top;
+ BN_MONT_CTX *mont = NULL;
+
+ int numPowers;
+ unsigned char *powerbufFree = NULL;
+ int powerbufLen = 0;
+ unsigned char *powerbuf = NULL;
+ BIGNUM tmp, am;
+
+ top = m->top;
+
+ if (!(m->d[0] & 1)) {
+ OPENSSL_PUT_ERROR(BN, BN_mod_exp_mont_consttime,
+ BN_R_CALLED_WITH_EVEN_MODULUS);
+ return 0;
+ }
+ bits = BN_num_bits(p);
+ if (bits == 0) {
+ ret = BN_one(rr);
+ return ret;
+ }
+
+ BN_CTX_start(ctx);
+
+ /* Allocate a montgomery context if it was not supplied by the caller.
+ * If this is not done, things will break in the montgomery part.
+ */
+  if (in_mont != NULL) {
+    mont = in_mont;
+  } else {
+    if ((mont = BN_MONT_CTX_new()) == NULL) {
+      goto err;
+    }
+    if (!BN_MONT_CTX_set(mont, m, ctx)) {
+      goto err;
+    }
+  }
+
+#ifdef RSAZ_ENABLED
+ /* If the size of the operands allow it, perform the optimized
+ * RSAZ exponentiation. For further information see
+ * crypto/bn/rsaz_exp.c and accompanying assembly modules. */
+ if ((16 == a->top) && (16 == p->top) && (BN_num_bits(m) == 1024) &&
+ rsaz_avx2_eligible()) {
+ if (NULL == bn_wexpand(rr, 16))
+ goto err;
+ RSAZ_1024_mod_exp_avx2(rr->d, a->d, p->d, m->d, mont->RR.d, mont->n0[0]);
+ rr->top = 16;
+ rr->neg = 0;
+ bn_correct_top(rr);
+ ret = 1;
+ goto err;
+ } else if ((8 == a->top) && (8 == p->top) && (BN_num_bits(m) == 512)) {
+ if (NULL == bn_wexpand(rr, 8))
+ goto err;
+ RSAZ_512_mod_exp(rr->d, a->d, p->d, m->d, mont->n0[0], mont->RR.d);
+ rr->top = 8;
+ rr->neg = 0;
+ bn_correct_top(rr);
+ ret = 1;
+ goto err;
+ }
+#endif
+
+ /* Get the window size to use with size of p. */
+ window = BN_window_bits_for_ctime_exponent_size(bits);
+#if defined(OPENSSL_BN_ASM_MONT5)
+ if (window >= 5) {
+ window = 5; /* ~5% improvement for RSA2048 sign, and even for RSA4096 */
+ if ((top & 7) == 0)
+ powerbufLen += 2 * top * sizeof(m->d[0]);
+ }
+#endif
+ (void)0;
+
+ /* Allocate a buffer large enough to hold all of the pre-computed
+ * powers of am, am itself and tmp.
+ */
+ numPowers = 1 << window;
+ powerbufLen +=
+ sizeof(m->d[0]) *
+ (top * numPowers + ((2 * top) > numPowers ? (2 * top) : numPowers));
+#ifdef alloca
+ if (powerbufLen < 3072)
+ powerbufFree = alloca(powerbufLen + MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH);
+ else
+#endif
+ if ((powerbufFree = (unsigned char *)OPENSSL_malloc(
+ powerbufLen + MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH)) == NULL)
+ goto err;
+
+ powerbuf = MOD_EXP_CTIME_ALIGN(powerbufFree);
+ memset(powerbuf, 0, powerbufLen);
+
+#ifdef alloca
+ if (powerbufLen < 3072)
+ powerbufFree = NULL;
+#endif
+
+ /* lay down tmp and am right after powers table */
+ tmp.d = (BN_ULONG *)(powerbuf + sizeof(m->d[0]) * top * numPowers);
+ am.d = tmp.d + top;
+ tmp.top = am.top = 0;
+ tmp.dmax = am.dmax = top;
+ tmp.neg = am.neg = 0;
+ tmp.flags = am.flags = BN_FLG_STATIC_DATA;
+
+  /* Prepare a^0 in the Montgomery domain, per Shay Gueron's suggestion. */
+ if (m->d[top - 1] & (((BN_ULONG)1) << (BN_BITS2 - 1))) {
+ /* 2^(top*BN_BITS2) - m */
+ tmp.d[0] = (0 - m->d[0]) & BN_MASK2;
+ for (i = 1; i < top; i++)
+ tmp.d[i] = (~m->d[i]) & BN_MASK2;
+ tmp.top = top;
+ } else if (!BN_to_montgomery(&tmp, BN_value_one(), mont, ctx))
+ goto err;
+
+ /* prepare a^1 in Montgomery domain */
+ if (a->neg || BN_ucmp(a, m) >= 0) {
+ if (!BN_mod(&am, a, m, ctx))
+ goto err;
+ if (!BN_to_montgomery(&am, &am, mont, ctx))
+ goto err;
+ } else if (!BN_to_montgomery(&am, a, mont, ctx))
+ goto err;
+
+#if defined(OPENSSL_BN_ASM_MONT5)
+ /* This optimization uses ideas from http://eprint.iacr.org/2011/239,
+ * specifically optimization of cache-timing attack countermeasures
+ * and pre-computation optimization. */
+
+ /* Dedicated window==4 case improves 512-bit RSA sign by ~15%, but as
+ * 512-bit RSA is hardly relevant, we omit it to spare size... */
+ if (window == 5 && top > 1) {
+ void bn_mul_mont_gather5(BN_ULONG * rp, const BN_ULONG * ap,
+ const void * table, const BN_ULONG * np,
+ const BN_ULONG * n0, int num, int power);
+ void bn_scatter5(const BN_ULONG * inp, size_t num, void * table,
+ size_t power);
+ void bn_gather5(BN_ULONG * out, size_t num, void * table, size_t power);
+ void bn_power5(BN_ULONG * rp, const BN_ULONG * ap, const void * table,
+ const BN_ULONG * np, const BN_ULONG * n0, int num,
+ int power);
+ int bn_from_montgomery(BN_ULONG * rp, const BN_ULONG * ap,
+ const BN_ULONG * not_used, const BN_ULONG * np,
+ const BN_ULONG * n0, int num);
+
+ BN_ULONG *np = mont->N.d, *n0 = mont->n0, *np2;
+
+ /* BN_to_montgomery can contaminate words above .top
+ * [in BN_DEBUG[_DEBUG] build]... */
+ for (i = am.top; i < top; i++)
+ am.d[i] = 0;
+ for (i = tmp.top; i < top; i++)
+ tmp.d[i] = 0;
+
+ if (top & 7)
+ np2 = np;
+ else
+ for (np2 = am.d + top, i = 0; i < top; i++)
+ np2[2 * i] = np[i];
+
+ bn_scatter5(tmp.d, top, powerbuf, 0);
+ bn_scatter5(am.d, am.top, powerbuf, 1);
+ bn_mul_mont(tmp.d, am.d, am.d, np, n0, top);
+ bn_scatter5(tmp.d, top, powerbuf, 2);
+
+ /* same as above, but uses squaring for 1/2 of operations */
+ for (i = 4; i < 32; i *= 2) {
+ bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
+ bn_scatter5(tmp.d, top, powerbuf, i);
+ }
+ for (i = 3; i < 8; i += 2) {
+ int j;
+ bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
+ bn_scatter5(tmp.d, top, powerbuf, i);
+ for (j = 2 * i; j < 32; j *= 2) {
+ bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
+ bn_scatter5(tmp.d, top, powerbuf, j);
+ }
+ }
+ for (; i < 16; i += 2) {
+ bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
+ bn_scatter5(tmp.d, top, powerbuf, i);
+ bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
+ bn_scatter5(tmp.d, top, powerbuf, 2 * i);
+ }
+ for (; i < 32; i += 2) {
+ bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
+ bn_scatter5(tmp.d, top, powerbuf, i);
+ }
+
+ bits--;
+ for (wvalue = 0, i = bits % 5; i >= 0; i--, bits--)
+ wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
+ bn_gather5(tmp.d, top, powerbuf, wvalue);
+
+ /* At this point |bits| is 4 mod 5 and at least -1. (|bits| is the first bit
+ * that has not been read yet.) */
+ assert(bits >= -1 && (bits == -1 || bits % 5 == 4));
+
+ /* Scan the exponent one window at a time starting from the most
+ * significant bits.
+ */
+ if (top & 7) {
+ while (bits >= 0) {
+ for (wvalue = 0, i = 0; i < 5; i++, bits--)
+ wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
+
+ bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
+ bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
+ bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
+ bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
+ bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
+ bn_mul_mont_gather5(tmp.d, tmp.d, powerbuf, np, n0, top, wvalue);
+ }
+ } else {
+ const uint8_t *p_bytes = (const uint8_t *)p->d;
+ int max_bits = p->top * BN_BITS2;
+ assert(bits < max_bits);
+ /* |p = 0| has been handled as a special case, so |max_bits| is at least
+ * one word. */
+ assert(max_bits >= 64);
+
+ /* If the first bit to be read lands in the last byte, unroll the first
+ * iteration to avoid reading past the bounds of |p->d|. (After the first
+ * iteration, we are guaranteed to be past the last byte.) Note |bits|
+ * here is the top bit, inclusive. */
+ if (bits - 4 >= max_bits - 8) {
+ /* Read five bits from |bits-4| through |bits|, inclusive. */
+ wvalue = p_bytes[p->top * BN_BYTES - 1];
+ wvalue >>= (bits - 4) & 7;
+ wvalue &= 0x1f;
+ bits -= 5;
+ bn_power5(tmp.d, tmp.d, powerbuf, np2, n0, top, wvalue);
+ }
+ while (bits >= 0) {
+ /* Read five bits from |bits-4| through |bits|, inclusive. */
+ int first_bit = bits - 4;
+ wvalue = *(const uint16_t *) (p_bytes + (first_bit >> 3));
+ wvalue >>= first_bit & 7;
+ wvalue &= 0x1f;
+ bits -= 5;
+ bn_power5(tmp.d, tmp.d, powerbuf, np2, n0, top, wvalue);
+ }
+ }
+
+ ret = bn_from_montgomery(tmp.d, tmp.d, NULL, np2, n0, top);
+ tmp.top = top;
+ bn_correct_top(&tmp);
+ if (ret) {
+ if (!BN_copy(rr, &tmp))
+ ret = 0;
+    goto err; /* A non-zero |ret| here indicates success, not an error. */
+ }
+ } else
+#endif
+ {
+ if (!copy_to_prebuf(&tmp, top, powerbuf, 0, numPowers))
+ goto err;
+ if (!copy_to_prebuf(&am, top, powerbuf, 1, numPowers))
+ goto err;
+
+ /* If the window size is greater than 1, then calculate
+ * val[i=2..2^winsize-1]. Powers are computed as a*a^(i-1)
+ * (even powers could instead be computed as (a^(i/2))^2
+ * to use the slight performance advantage of sqr over mul).
+ */
+ if (window > 1) {
+ if (!BN_mod_mul_montgomery(&tmp, &am, &am, mont, ctx))
+ goto err;
+ if (!copy_to_prebuf(&tmp, top, powerbuf, 2, numPowers))
+ goto err;
+ for (i = 3; i < numPowers; i++) {
+ /* Calculate a^i = a^(i-1) * a */
+ if (!BN_mod_mul_montgomery(&tmp, &am, &tmp, mont, ctx))
+ goto err;
+ if (!copy_to_prebuf(&tmp, top, powerbuf, i, numPowers))
+ goto err;
+ }
+ }
+
+ bits--;
+ for (wvalue = 0, i = bits % window; i >= 0; i--, bits--)
+ wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
+ if (!copy_from_prebuf(&tmp, top, powerbuf, wvalue, numPowers))
+ goto err;
+
+ /* Scan the exponent one window at a time starting from the most
+ * significant bits.
+ */
+ while (bits >= 0) {
+ wvalue = 0; /* The 'value' of the window */
+
+ /* Scan the window, squaring the result as we go */
+ for (i = 0; i < window; i++, bits--) {
+ if (!BN_mod_mul_montgomery(&tmp, &tmp, &tmp, mont, ctx))
+ goto err;
+ wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
+ }
+
+ /* Fetch the appropriate pre-computed value from the pre-buf */
+ if (!copy_from_prebuf(&am, top, powerbuf, wvalue, numPowers))
+ goto err;
+
+ /* Multiply the result into the intermediate result */
+ if (!BN_mod_mul_montgomery(&tmp, &tmp, &am, mont, ctx))
+ goto err;
+ }
+ }
+
+ /* Convert the final result from montgomery to standard format */
+ if (!BN_from_montgomery(rr, &tmp, mont, ctx))
+ goto err;
+ ret = 1;
+err:
+ if ((in_mont == NULL) && (mont != NULL))
+ BN_MONT_CTX_free(mont);
+ if (powerbuf != NULL) {
+ OPENSSL_cleanse(powerbuf, powerbufLen);
+ if (powerbufFree)
+ OPENSSL_free(powerbufFree);
+ }
+ BN_CTX_end(ctx);
+ return (ret);
+}
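+
+/* A minimal usage sketch for the function above; |example_mod_exp| is an
+ * illustrative name, error handling is elided, and |mod| must be odd.
+ * Passing NULL as the last argument lets the function construct its own
+ * Montgomery context. */
+static int example_mod_exp(BIGNUM *r, const BIGNUM *base, const BIGNUM *exp,
+                           const BIGNUM *mod, BN_CTX *ctx) {
+  return BN_mod_exp_mont_consttime(r, base, exp, mod, ctx, NULL);
+}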
+
+int BN_mod_exp_mont_word(BIGNUM *rr, BN_ULONG a, const BIGNUM *p,
+ const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont) {
+ BN_MONT_CTX *mont = NULL;
+ int b, bits, ret = 0;
+ int r_is_one;
+ BN_ULONG w, next_w;
+ BIGNUM *d, *r, *t;
+ BIGNUM *swap_tmp;
+#define BN_MOD_MUL_WORD(r, w, m) \
+ (BN_mul_word(r, (w)) && \
+ (/* BN_ucmp(r, (m)) < 0 ? 1 :*/ \
+ (BN_mod(t, r, m, ctx) && (swap_tmp = r, r = t, t = swap_tmp, 1))))
+ /* BN_MOD_MUL_WORD is only used with 'w' large, so the BN_ucmp test is
+ * probably more overhead than always using BN_mod (which uses BN_copy if a
+ * similar test returns true). We can use BN_mod and do not need BN_nnmod
+ * because our accumulator is never negative (the result of BN_mod does not
+ * depend on the sign of the modulus). */
+#define BN_TO_MONTGOMERY_WORD(r, w, mont) \
+ (BN_set_word(r, (w)) && BN_to_montgomery(r, r, (mont), ctx))
+
+ if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0) {
+ /* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
+ OPENSSL_PUT_ERROR(BN, BN_mod_exp_mont_word,
+ ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
+ return 0;
+ }
+
+ if (!BN_is_odd(m)) {
+ OPENSSL_PUT_ERROR(BN, BN_mod_exp_mont_word, BN_R_CALLED_WITH_EVEN_MODULUS);
+ return 0;
+ }
+
+ if (m->top == 1) {
+ a %= m->d[0]; /* make sure that 'a' is reduced */
+ }
+
+ bits = BN_num_bits(p);
+ if (bits == 0) {
+ /* x**0 mod 1 is still zero. */
+ if (BN_is_one(m)) {
+ ret = 1;
+ BN_zero(rr);
+ } else {
+ ret = BN_one(rr);
+ }
+ return ret;
+ }
+ if (a == 0) {
+ BN_zero(rr);
+ ret = 1;
+ return ret;
+ }
+
+ BN_CTX_start(ctx);
+ d = BN_CTX_get(ctx);
+ r = BN_CTX_get(ctx);
+ t = BN_CTX_get(ctx);
+ if (d == NULL || r == NULL || t == NULL) {
+ goto err;
+ }
+
+ if (in_mont != NULL)
+ mont = in_mont;
+ else {
+ if ((mont = BN_MONT_CTX_new()) == NULL) {
+ goto err;
+ }
+ if (!BN_MONT_CTX_set(mont, m, ctx)) {
+ goto err;
+ }
+ }
+
+ r_is_one = 1; /* except for Montgomery factor */
+
+ /* bits-1 >= 0 */
+
+ /* The result is accumulated in the product r*w. */
+ w = a; /* bit 'bits-1' of 'p' is always set */
+ for (b = bits - 2; b >= 0; b--) {
+ /* First, square r*w. */
+ next_w = w * w;
+ if ((next_w / w) != w) {
+ /* overflow */
+ if (r_is_one) {
+ if (!BN_TO_MONTGOMERY_WORD(r, w, mont)) {
+ goto err;
+ }
+ r_is_one = 0;
+ } else {
+ if (!BN_MOD_MUL_WORD(r, w, m)) {
+ goto err;
+ }
+ }
+ next_w = 1;
+ }
+
+ w = next_w;
+ if (!r_is_one) {
+ if (!BN_mod_mul_montgomery(r, r, r, mont, ctx)) {
+ goto err;
+ }
+ }
+
+ /* Second, multiply r*w by 'a' if exponent bit is set. */
+ if (BN_is_bit_set(p, b)) {
+ next_w = w * a;
+ if ((next_w / a) != w) {
+ /* overflow */
+ if (r_is_one) {
+ if (!BN_TO_MONTGOMERY_WORD(r, w, mont)) {
+ goto err;
+ }
+ r_is_one = 0;
+ } else {
+ if (!BN_MOD_MUL_WORD(r, w, m)) {
+ goto err;
+ }
+ }
+ next_w = a;
+ }
+ w = next_w;
+ }
+ }
+
+ /* Finally, set r:=r*w. */
+ if (w != 1) {
+ if (r_is_one) {
+ if (!BN_TO_MONTGOMERY_WORD(r, w, mont)) {
+ goto err;
+ }
+ r_is_one = 0;
+ } else {
+ if (!BN_MOD_MUL_WORD(r, w, m)) {
+ goto err;
+ }
+ }
+ }
+
+ if (r_is_one) {
+    /* This can happen only if a == 1. */
+ if (!BN_one(rr)) {
+ goto err;
+ }
+ } else {
+ if (!BN_from_montgomery(rr, r, mont, ctx)) {
+ goto err;
+ }
+ }
+ ret = 1;
+
+err:
+ if (in_mont == NULL && mont != NULL) {
+ BN_MONT_CTX_free(mont);
+ }
+ BN_CTX_end(ctx);
+ return ret;
+}
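+
+/* The overflow test used in the loop above, in isolation: a product of two
+ * words fits in a word iff dividing it back recovers the original factor.
+ * |example_word_mul_overflows| is an illustrative name. */
+static int example_word_mul_overflows(BN_ULONG w, BN_ULONG a) {
+  BN_ULONG product = w * a; /* wraps modulo 2^BN_BITS2 on overflow */
+  return a != 0 && (product / a) != w;
+}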
+
+#define TABLE_SIZE 32
+
+int BN_mod_exp2_mont(BIGNUM *rr, const BIGNUM *a1, const BIGNUM *p1,
+ const BIGNUM *a2, const BIGNUM *p2, const BIGNUM *m,
+ BN_CTX *ctx, BN_MONT_CTX *in_mont) {
+ int i, j, bits, b, bits1, bits2, ret = 0, wpos1, wpos2, window1, window2,
+ wvalue1, wvalue2;
+ int r_is_one = 1;
+ BIGNUM *d, *r;
+ const BIGNUM *a_mod_m;
+ /* Tables of variables obtained from 'ctx' */
+ BIGNUM *val1[TABLE_SIZE], *val2[TABLE_SIZE];
+ BN_MONT_CTX *mont = NULL;
+
+ if (!(m->d[0] & 1)) {
+ OPENSSL_PUT_ERROR(BN, BN_mod_exp2_mont, BN_R_CALLED_WITH_EVEN_MODULUS);
+ return 0;
+ }
+  bits1 = BN_num_bits(p1);
+  bits2 = BN_num_bits(p2);
+  if (bits1 == 0 && bits2 == 0) {
+    /* x**0 * y**0 mod 1 is still zero. */
+    if (BN_is_one(m)) {
+      BN_zero(rr);
+      return 1;
+    }
+    return BN_one(rr);
+  }
+
+ bits = (bits1 > bits2) ? bits1 : bits2;
+
+ BN_CTX_start(ctx);
+ d = BN_CTX_get(ctx);
+ r = BN_CTX_get(ctx);
+ val1[0] = BN_CTX_get(ctx);
+ val2[0] = BN_CTX_get(ctx);
+ if (!d || !r || !val1[0] || !val2[0]) {
+ goto err;
+ }
+
+ if (in_mont != NULL) {
+ mont = in_mont;
+ } else {
+ mont = BN_MONT_CTX_new();
+ if (mont == NULL) {
+ goto err;
+ }
+ if (!BN_MONT_CTX_set(mont, m, ctx)) {
+ goto err;
+ }
+ }
+
+ window1 = BN_window_bits_for_exponent_size(bits1);
+ window2 = BN_window_bits_for_exponent_size(bits2);
+
+  /* Build table for a1: val1[i] := a1^(2*i + 1) mod m for
+   * i = 0 .. 2^(window1-1) - 1. */
+ if (a1->neg || BN_ucmp(a1, m) >= 0) {
+ if (!BN_mod(val1[0], a1, m, ctx)) {
+ goto err;
+ }
+ a_mod_m = val1[0];
+ } else {
+ a_mod_m = a1;
+ }
+
+ if (BN_is_zero(a_mod_m)) {
+ BN_zero(rr);
+ ret = 1;
+ goto err;
+ }
+
+ if (!BN_to_montgomery(val1[0], a_mod_m, mont, ctx)) {
+ goto err;
+ }
+
+ if (window1 > 1) {
+ if (!BN_mod_mul_montgomery(d, val1[0], val1[0], mont, ctx)) {
+ goto err;
+ }
+
+ j = 1 << (window1 - 1);
+ for (i = 1; i < j; i++) {
+ if (((val1[i] = BN_CTX_get(ctx)) == NULL) ||
+ !BN_mod_mul_montgomery(val1[i], val1[i - 1], d, mont, ctx)) {
+ goto err;
+ }
+ }
+ }
+
+  /* Build table for a2: val2[i] := a2^(2*i + 1) mod m for
+   * i = 0 .. 2^(window2-1) - 1. */
+ if (a2->neg || BN_ucmp(a2, m) >= 0) {
+ if (!BN_mod(val2[0], a2, m, ctx)) {
+ goto err;
+ }
+ a_mod_m = val2[0];
+ } else {
+ a_mod_m = a2;
+ }
+
+ if (BN_is_zero(a_mod_m)) {
+ BN_zero(rr);
+ ret = 1;
+ goto err;
+ }
+
+ if (!BN_to_montgomery(val2[0], a_mod_m, mont, ctx)) {
+ goto err;
+ }
+
+ if (window2 > 1) {
+ if (!BN_mod_mul_montgomery(d, val2[0], val2[0], mont, ctx)) {
+ goto err;
+ }
+
+ j = 1 << (window2 - 1);
+ for (i = 1; i < j; i++) {
+ if (((val2[i] = BN_CTX_get(ctx)) == NULL) ||
+ !BN_mod_mul_montgomery(val2[i], val2[i - 1], d, mont, ctx)) {
+ goto err;
+ }
+ }
+ }
+
+ /* Now compute the power product, using independent windows. */
+ r_is_one = 1;
+ wvalue1 = 0; /* The 'value' of the first window */
+ wvalue2 = 0; /* The 'value' of the second window */
+ wpos1 = 0; /* If wvalue1 > 0, the bottom bit of the first window */
+ wpos2 = 0; /* If wvalue2 > 0, the bottom bit of the second window */
+
+ if (!BN_to_montgomery(r, BN_value_one(), mont, ctx)) {
+ goto err;
+ }
+
+ for (b = bits - 1; b >= 0; b--) {
+ if (!r_is_one) {
+ if (!BN_mod_mul_montgomery(r, r, r, mont, ctx)) {
+ goto err;
+ }
+ }
+
+ if (!wvalue1 && BN_is_bit_set(p1, b)) {
+ /* consider bits b-window1+1 .. b for this window */
+ i = b - window1 + 1;
+ while (!BN_is_bit_set(p1, i)) /* works for i<0 */
+ i++;
+ wpos1 = i;
+ wvalue1 = 1;
+ for (i = b - 1; i >= wpos1; i--) {
+ wvalue1 <<= 1;
+ if (BN_is_bit_set(p1, i))
+ wvalue1++;
+ }
+ }
+
+ if (!wvalue2 && BN_is_bit_set(p2, b)) {
+ /* consider bits b-window2+1 .. b for this window */
+ i = b - window2 + 1;
+ while (!BN_is_bit_set(p2, i))
+ i++;
+ wpos2 = i;
+ wvalue2 = 1;
+ for (i = b - 1; i >= wpos2; i--) {
+ wvalue2 <<= 1;
+ if (BN_is_bit_set(p2, i))
+ wvalue2++;
+ }
+ }
+
+ if (wvalue1 && b == wpos1) {
+ /* wvalue1 is odd and < 2^window1 */
+ if (!BN_mod_mul_montgomery(r, r, val1[wvalue1 >> 1], mont, ctx)) {
+ goto err;
+ }
+ wvalue1 = 0;
+ r_is_one = 0;
+ }
+
+ if (wvalue2 && b == wpos2) {
+ /* wvalue2 is odd and < 2^window2 */
+ if (!BN_mod_mul_montgomery(r, r, val2[wvalue2 >> 1], mont, ctx)) {
+ goto err;
+ }
+ wvalue2 = 0;
+ r_is_one = 0;
+ }
+ }
+
+ if (!BN_from_montgomery(rr, r, mont, ctx)) {
+ goto err;
+ }
+ ret = 1;
+
+err:
+ if (in_mont == NULL && mont != NULL) {
+ BN_MONT_CTX_free(mont);
+ }
+ BN_CTX_end(ctx);
+ return ret;
+}
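+
+/* A usage sketch for the two-exponent function above (illustrative name,
+ * error handling elided): computes r = (a1^p1 * a2^p2) mod m in a single
+ * interleaved square-and-multiply pass, as used e.g. in DSA verification. */
+static int example_mod_exp2(BIGNUM *r, const BIGNUM *a1, const BIGNUM *p1,
+                            const BIGNUM *a2, const BIGNUM *p2,
+                            const BIGNUM *m, BN_CTX *ctx) {
+  return BN_mod_exp2_mont(r, a1, p1, a2, p2, m, ctx, NULL);
+}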
diff --git a/src/crypto/bn/gcd.c b/src/crypto/bn/gcd.c
new file mode 100644
index 0000000..2dce296
--- /dev/null
+++ b/src/crypto/bn/gcd.c
@@ -0,0 +1,704 @@
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to. The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code. The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * "This product includes cryptographic software written by
+ * Eric Young (eay@cryptsoft.com)"
+ * The word 'cryptographic' can be left out if the rouines from the library
+ * being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ * the apps directory (application code) you must include an acknowledgement:
+ * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed. i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+/* ====================================================================
+ * Copyright (c) 1998-2001 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com). */
+
+#include <openssl/bn.h>
+
+#include <openssl/err.h>
+
+#include "internal.h"
+
+static BIGNUM *euclid(BIGNUM *a, BIGNUM *b) {
+ BIGNUM *t;
+ int shifts = 0;
+
+ /* 0 <= b <= a */
+ while (!BN_is_zero(b)) {
+ /* 0 < b <= a */
+
+ if (BN_is_odd(a)) {
+ if (BN_is_odd(b)) {
+ if (!BN_sub(a, a, b)) {
+ goto err;
+ }
+ if (!BN_rshift1(a, a)) {
+ goto err;
+ }
+ if (BN_cmp(a, b) < 0) {
+ t = a;
+ a = b;
+ b = t;
+ }
+ } else {
+ /* a odd - b even */
+ if (!BN_rshift1(b, b)) {
+ goto err;
+ }
+ if (BN_cmp(a, b) < 0) {
+ t = a;
+ a = b;
+ b = t;
+ }
+ }
+ } else {
+ /* a is even */
+ if (BN_is_odd(b)) {
+ if (!BN_rshift1(a, a)) {
+ goto err;
+ }
+ if (BN_cmp(a, b) < 0) {
+ t = a;
+ a = b;
+ b = t;
+ }
+ } else {
+ /* a even - b even */
+ if (!BN_rshift1(a, a)) {
+ goto err;
+ }
+ if (!BN_rshift1(b, b)) {
+ goto err;
+ }
+ shifts++;
+ }
+ }
+ /* 0 <= b <= a */
+ }
+
+ if (shifts) {
+ if (!BN_lshift(a, a, shifts)) {
+ goto err;
+ }
+ }
+
+ return a;
+
+err:
+ return NULL;
+}
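+
+/* A short trace of the binary reduction above: euclid(36, 10) -> both even,
+ * so (18, 5) with shifts = 1 -> a even, b odd, so (9, 5) -> both odd, so
+ * ((9 - 5)/2, 5), then swap to (5, 2) -> and so on down to (1, 0). The saved
+ * shift then restores the common factor of two, giving gcd 2. */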
+
+int BN_gcd(BIGNUM *r, const BIGNUM *in_a, const BIGNUM *in_b, BN_CTX *ctx) {
+ BIGNUM *a, *b, *t;
+ int ret = 0;
+
+ BN_CTX_start(ctx);
+ a = BN_CTX_get(ctx);
+ b = BN_CTX_get(ctx);
+
+ if (a == NULL || b == NULL) {
+ goto err;
+ }
+ if (BN_copy(a, in_a) == NULL) {
+ goto err;
+ }
+ if (BN_copy(b, in_b) == NULL) {
+ goto err;
+ }
+
+ a->neg = 0;
+ b->neg = 0;
+
+ if (BN_cmp(a, b) < 0) {
+ t = a;
+ a = b;
+ b = t;
+ }
+ t = euclid(a, b);
+ if (t == NULL) {
+ goto err;
+ }
+
+ if (BN_copy(r, t) == NULL) {
+ goto err;
+ }
+ ret = 1;
+
+err:
+ BN_CTX_end(ctx);
+ return ret;
+}
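+
+/* A minimal usage sketch (|example_gcd| is an illustrative name), matching
+ * the trace above: gcd(36, 10) == 2. */
+static int example_gcd(BN_CTX *ctx) {
+  int ok = 0;
+  BIGNUM *a = BN_new(), *b = BN_new(), *r = BN_new();
+  if (a && b && r && BN_set_word(a, 36) && BN_set_word(b, 10) &&
+      BN_gcd(r, a, b, ctx)) {
+    ok = BN_is_word(r, 2);
+  }
+  BN_free(a);
+  BN_free(b);
+  BN_free(r);
+  return ok;
+}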
+
+/* Solves a*x == 1 (mod n). */
+static BIGNUM *BN_mod_inverse_no_branch(BIGNUM *out, const BIGNUM *a,
+ const BIGNUM *n, BN_CTX *ctx);
+
+BIGNUM *BN_mod_inverse(BIGNUM *out, const BIGNUM *a, const BIGNUM *n,
+ BN_CTX *ctx) {
+ BIGNUM *A, *B, *X, *Y, *M, *D, *T, *R = NULL;
+ BIGNUM *ret = NULL;
+ int sign;
+
+ if ((a->flags & BN_FLG_CONSTTIME) != 0 ||
+ (n->flags & BN_FLG_CONSTTIME) != 0) {
+ return BN_mod_inverse_no_branch(out, a, n, ctx);
+ }
+
+ BN_CTX_start(ctx);
+ A = BN_CTX_get(ctx);
+ B = BN_CTX_get(ctx);
+ X = BN_CTX_get(ctx);
+ D = BN_CTX_get(ctx);
+ M = BN_CTX_get(ctx);
+ Y = BN_CTX_get(ctx);
+ T = BN_CTX_get(ctx);
+ if (T == NULL) {
+ goto err;
+ }
+
+ if (out == NULL) {
+ R = BN_new();
+ } else {
+ R = out;
+ }
+ if (R == NULL) {
+ goto err;
+ }
+
+ BN_one(X);
+ BN_zero(Y);
+ if (BN_copy(B, a) == NULL) {
+ goto err;
+ }
+ if (BN_copy(A, n) == NULL) {
+ goto err;
+ }
+ A->neg = 0;
+ if (B->neg || (BN_ucmp(B, A) >= 0)) {
+ if (!BN_nnmod(B, B, A, ctx)) {
+ goto err;
+ }
+ }
+ sign = -1;
+ /* From B = a mod |n|, A = |n| it follows that
+ *
+ * 0 <= B < A,
+ * -sign*X*a == B (mod |n|),
+ * sign*Y*a == A (mod |n|).
+ */
+
+ if (BN_is_odd(n) && (BN_num_bits(n) <= (BN_BITS <= 32 ? 450 : 2048))) {
+    /* Binary inversion algorithm; requires an odd modulus. This is faster
+     * than the general algorithm if the modulus is sufficiently small
+     * (about 400 .. 500 bits on 32-bit systems, but much more on 64-bit
+     * systems). */
+ int shift;
+
+ while (!BN_is_zero(B)) {
+ /* 0 < B < |n|,
+ * 0 < A <= |n|,
+ * (1) -sign*X*a == B (mod |n|),
+ * (2) sign*Y*a == A (mod |n|) */
+
+ /* Now divide B by the maximum possible power of two in the integers,
+ * and divide X by the same value mod |n|.
+ * When we're done, (1) still holds. */
+ shift = 0;
+ while (!BN_is_bit_set(B, shift)) {
+ /* note that 0 < B */
+ shift++;
+
+ if (BN_is_odd(X)) {
+ if (!BN_uadd(X, X, n)) {
+ goto err;
+ }
+ }
+ /* now X is even, so we can easily divide it by two */
+ if (!BN_rshift1(X, X)) {
+ goto err;
+ }
+ }
+ if (shift > 0) {
+ if (!BN_rshift(B, B, shift)) {
+ goto err;
+ }
+ }
+
+ /* Same for A and Y. Afterwards, (2) still holds. */
+ shift = 0;
+ while (!BN_is_bit_set(A, shift)) {
+ /* note that 0 < A */
+ shift++;
+
+ if (BN_is_odd(Y)) {
+ if (!BN_uadd(Y, Y, n)) {
+ goto err;
+ }
+ }
+ /* now Y is even */
+ if (!BN_rshift1(Y, Y)) {
+ goto err;
+ }
+ }
+ if (shift > 0) {
+ if (!BN_rshift(A, A, shift)) {
+ goto err;
+ }
+ }
+
+ /* We still have (1) and (2).
+ * Both A and B are odd.
+ * The following computations ensure that
+ *
+ * 0 <= B < |n|,
+ * 0 < A < |n|,
+ * (1) -sign*X*a == B (mod |n|),
+ * (2) sign*Y*a == A (mod |n|),
+ *
+ * and that either A or B is even in the next iteration. */
+ if (BN_ucmp(B, A) >= 0) {
+ /* -sign*(X + Y)*a == B - A (mod |n|) */
+ if (!BN_uadd(X, X, Y)) {
+ goto err;
+ }
+ /* NB: we could use BN_mod_add_quick(X, X, Y, n), but that
+ * actually makes the algorithm slower */
+ if (!BN_usub(B, B, A)) {
+ goto err;
+ }
+ } else {
+ /* sign*(X + Y)*a == A - B (mod |n|) */
+ if (!BN_uadd(Y, Y, X)) {
+ goto err;
+ }
+ /* as above, BN_mod_add_quick(Y, Y, X, n) would slow things down */
+ if (!BN_usub(A, A, B)) {
+ goto err;
+ }
+ }
+ }
+ } else {
+ /* general inversion algorithm */
+
+ while (!BN_is_zero(B)) {
+ BIGNUM *tmp;
+
+ /*
+ * 0 < B < A,
+ * (*) -sign*X*a == B (mod |n|),
+ * sign*Y*a == A (mod |n|) */
+
+ /* (D, M) := (A/B, A%B) ... */
+ if (BN_num_bits(A) == BN_num_bits(B)) {
+ if (!BN_one(D)) {
+ goto err;
+ }
+ if (!BN_sub(M, A, B)) {
+ goto err;
+ }
+ } else if (BN_num_bits(A) == BN_num_bits(B) + 1) {
+ /* A/B is 1, 2, or 3 */
+ if (!BN_lshift1(T, B)) {
+ goto err;
+ }
+ if (BN_ucmp(A, T) < 0) {
+ /* A < 2*B, so D=1 */
+ if (!BN_one(D)) {
+ goto err;
+ }
+ if (!BN_sub(M, A, B)) {
+ goto err;
+ }
+ } else {
+ /* A >= 2*B, so D=2 or D=3 */
+ if (!BN_sub(M, A, T)) {
+ goto err;
+ }
+ if (!BN_add(D, T, B)) {
+ goto err; /* use D (:= 3*B) as temp */
+ }
+ if (BN_ucmp(A, D) < 0) {
+ /* A < 3*B, so D=2 */
+ if (!BN_set_word(D, 2)) {
+ goto err;
+ }
+ /* M (= A - 2*B) already has the correct value */
+ } else {
+ /* only D=3 remains */
+ if (!BN_set_word(D, 3)) {
+ goto err;
+ }
+ /* currently M = A - 2*B, but we need M = A - 3*B */
+ if (!BN_sub(M, M, B)) {
+ goto err;
+ }
+ }
+ }
+ } else {
+ if (!BN_div(D, M, A, B, ctx)) {
+ goto err;
+ }
+ }
+
+ /* Now
+ * A = D*B + M;
+ * thus we have
+ * (**) sign*Y*a == D*B + M (mod |n|). */
+
+ tmp = A; /* keep the BIGNUM object, the value does not matter */
+
+ /* (A, B) := (B, A mod B) ... */
+ A = B;
+ B = M;
+ /* ... so we have 0 <= B < A again */
+
+ /* Since the former M is now B and the former B is now A,
+ * (**) translates into
+ * sign*Y*a == D*A + B (mod |n|),
+ * i.e.
+ * sign*Y*a - D*A == B (mod |n|).
+ * Similarly, (*) translates into
+ * -sign*X*a == A (mod |n|).
+ *
+ * Thus,
+ * sign*Y*a + D*sign*X*a == B (mod |n|),
+ * i.e.
+ * sign*(Y + D*X)*a == B (mod |n|).
+ *
+ * So if we set (X, Y, sign) := (Y + D*X, X, -sign), we arrive back at
+ * -sign*X*a == B (mod |n|),
+ * sign*Y*a == A (mod |n|).
+ * Note that X and Y stay non-negative all the time. */
+
+ /* most of the time D is very small, so we can optimize tmp := D*X+Y */
+ if (BN_is_one(D)) {
+ if (!BN_add(tmp, X, Y)) {
+ goto err;
+ }
+ } else {
+ if (BN_is_word(D, 2)) {
+ if (!BN_lshift1(tmp, X)) {
+ goto err;
+ }
+ } else if (BN_is_word(D, 4)) {
+ if (!BN_lshift(tmp, X, 2)) {
+ goto err;
+ }
+ } else if (D->top == 1) {
+ if (!BN_copy(tmp, X)) {
+ goto err;
+ }
+ if (!BN_mul_word(tmp, D->d[0])) {
+ goto err;
+ }
+ } else {
+ if (!BN_mul(tmp, D, X, ctx)) {
+ goto err;
+ }
+ }
+ if (!BN_add(tmp, tmp, Y)) {
+ goto err;
+ }
+ }
+
+ M = Y; /* keep the BIGNUM object, the value does not matter */
+ Y = X;
+ X = tmp;
+ sign = -sign;
+ }
+ }
+
+ /* The while loop (Euclid's algorithm) ends when
+ * A == gcd(a,n);
+ * we have
+ * sign*Y*a == A (mod |n|),
+ * where Y is non-negative. */
+
+ if (sign < 0) {
+ if (!BN_sub(Y, n, Y)) {
+ goto err;
+ }
+ }
+ /* Now Y*a == A (mod |n|). */
+
+ if (BN_is_one(A)) {
+ /* Y*a == 1 (mod |n|) */
+ if (!Y->neg && BN_ucmp(Y, n) < 0) {
+ if (!BN_copy(R, Y)) {
+ goto err;
+ }
+ } else {
+ if (!BN_nnmod(R, Y, n, ctx)) {
+ goto err;
+ }
+ }
+ } else {
+ OPENSSL_PUT_ERROR(BN, BN_mod_inverse, BN_R_NO_INVERSE);
+ goto err;
+ }
+ ret = R;
+
+err:
+ if (ret == NULL && out == NULL) {
+ BN_free(R);
+ }
+ BN_CTX_end(ctx);
+ return ret;
+}
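+
+/* A minimal usage sketch for the function above (|example_mod_inverse| is an
+ * illustrative name): 3^-1 mod 11 is 4, since 3 * 4 == 12 == 1 (mod 11). */
+static int example_mod_inverse(BN_CTX *ctx) {
+  int ok = 0;
+  BIGNUM *a = BN_new(), *n = BN_new(), *inv = BN_new();
+  if (a && n && inv && BN_set_word(a, 3) && BN_set_word(n, 11) &&
+      BN_mod_inverse(inv, a, n, ctx) != NULL) {
+    ok = BN_is_word(inv, 4);
+  }
+  BN_free(a);
+  BN_free(n);
+  BN_free(inv);
+  return ok;
+}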
+
+/* BN_mod_inverse_no_branch is a special version of BN_mod_inverse.
+ * It does not contain branches that may leak sensitive information. */
+static BIGNUM *BN_mod_inverse_no_branch(BIGNUM *out, const BIGNUM *a,
+ const BIGNUM *n, BN_CTX *ctx) {
+ BIGNUM *A, *B, *X, *Y, *M, *D, *T, *R = NULL;
+ BIGNUM local_A, local_B;
+ BIGNUM *pA, *pB;
+ BIGNUM *ret = NULL;
+ int sign;
+
+ BN_CTX_start(ctx);
+ A = BN_CTX_get(ctx);
+ B = BN_CTX_get(ctx);
+ X = BN_CTX_get(ctx);
+ D = BN_CTX_get(ctx);
+ M = BN_CTX_get(ctx);
+ Y = BN_CTX_get(ctx);
+ T = BN_CTX_get(ctx);
+ if (T == NULL) {
+ goto err;
+ }
+
+ if (out == NULL) {
+ R = BN_new();
+ } else {
+ R = out;
+ }
+ if (R == NULL) {
+ goto err;
+ }
+
+ BN_one(X);
+ BN_zero(Y);
+ if (BN_copy(B, a) == NULL) {
+ goto err;
+ }
+ if (BN_copy(A, n) == NULL) {
+ goto err;
+ }
+ A->neg = 0;
+
+ if (B->neg || (BN_ucmp(B, A) >= 0)) {
+ /* Turn BN_FLG_CONSTTIME flag on, so that when BN_div is invoked,
+ * BN_div_no_branch will be called eventually.
+ */
+ pB = &local_B;
+ BN_with_flags(pB, B, BN_FLG_CONSTTIME);
+ if (!BN_nnmod(B, pB, A, ctx))
+ goto err;
+ }
+ sign = -1;
+ /* From B = a mod |n|, A = |n| it follows that
+ *
+ * 0 <= B < A,
+ * -sign*X*a == B (mod |n|),
+ * sign*Y*a == A (mod |n|).
+ */
+
+ while (!BN_is_zero(B)) {
+ BIGNUM *tmp;
+
+ /*
+ * 0 < B < A,
+ * (*) -sign*X*a == B (mod |n|),
+ * sign*Y*a == A (mod |n|)
+ */
+
+ /* Turn BN_FLG_CONSTTIME flag on, so that when BN_div is invoked,
+ * BN_div_no_branch will be called eventually.
+ */
+ pA = &local_A;
+ BN_with_flags(pA, A, BN_FLG_CONSTTIME);
+
+ /* (D, M) := (A/B, A%B) ... */
+ if (!BN_div(D, M, pA, B, ctx)) {
+ goto err;
+ }
+
+ /* Now
+ * A = D*B + M;
+ * thus we have
+ * (**) sign*Y*a == D*B + M (mod |n|).
+ */
+
+ tmp = A; /* keep the BIGNUM object, the value does not matter */
+
+ /* (A, B) := (B, A mod B) ... */
+ A = B;
+ B = M;
+ /* ... so we have 0 <= B < A again */
+
+ /* Since the former M is now B and the former B is now A,
+ * (**) translates into
+ * sign*Y*a == D*A + B (mod |n|),
+ * i.e.
+ * sign*Y*a - D*A == B (mod |n|).
+ * Similarly, (*) translates into
+ * -sign*X*a == A (mod |n|).
+ *
+ * Thus,
+ * sign*Y*a + D*sign*X*a == B (mod |n|),
+ * i.e.
+ * sign*(Y + D*X)*a == B (mod |n|).
+ *
+ * So if we set (X, Y, sign) := (Y + D*X, X, -sign), we arrive back at
+ * -sign*X*a == B (mod |n|),
+ * sign*Y*a == A (mod |n|).
+ * Note that X and Y stay non-negative all the time.
+ */
+
+ if (!BN_mul(tmp, D, X, ctx)) {
+ goto err;
+ }
+ if (!BN_add(tmp, tmp, Y)) {
+ goto err;
+ }
+
+ M = Y; /* keep the BIGNUM object, the value does not matter */
+ Y = X;
+ X = tmp;
+ sign = -sign;
+ }
+
+ /*
+ * The while loop (Euclid's algorithm) ends when
+ * A == gcd(a,n);
+ * we have
+ * sign*Y*a == A (mod |n|),
+ * where Y is non-negative.
+ */
+
+ if (sign < 0) {
+ if (!BN_sub(Y, n, Y)) {
+ goto err;
+ }
+ }
+ /* Now Y*a == A (mod |n|). */
+
+ if (BN_is_one(A)) {
+ /* Y*a == 1 (mod |n|) */
+ if (!Y->neg && BN_ucmp(Y, n) < 0) {
+ if (!BN_copy(R, Y)) {
+ goto err;
+ }
+ } else {
+ if (!BN_nnmod(R, Y, n, ctx)) {
+ goto err;
+ }
+ }
+ } else {
+ OPENSSL_PUT_ERROR(BN, BN_mod_inverse_no_branch, BN_R_NO_INVERSE);
+ goto err;
+ }
+ ret = R;
+
+err:
+ if (ret == NULL && out == NULL) {
+ BN_free(R);
+ }
+
+ BN_CTX_end(ctx);
+ return ret;
+}
diff --git a/src/crypto/bn/generic.c b/src/crypto/bn/generic.c
new file mode 100644
index 0000000..224a47c
--- /dev/null
+++ b/src/crypto/bn/generic.c
@@ -0,0 +1,1120 @@
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to. The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code. The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * "This product includes cryptographic software written by
+ * Eric Young (eay@cryptsoft.com)"
+ * The word 'cryptographic' can be left out if the rouines from the library
+ * being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ * the apps directory (application code) you must include an acknowledgement:
+ * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed. i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.] */
+
+#include <openssl/bn.h>
+
+#include <assert.h>
+
+#include "internal.h"
+
+
+/* Generic implementations of most operations are needed for:
+ * - Configurations without inline assembly.
+ * - Architectures other than x86 or x86_64.
+ * - Windows x86_64; x86_64-gcc.c does not build on MSVC. */
+#if defined(OPENSSL_NO_ASM) || \
+ (!defined(OPENSSL_X86_64) && !defined(OPENSSL_X86)) || \
+ (defined(OPENSSL_X86_64) && defined(OPENSSL_WINDOWS))
+
+#if defined(OPENSSL_WINDOWS)
+#define alloca _alloca
+#else
+#include <alloca.h>
+#endif
+
+#ifdef BN_LLONG
+#define mul_add(r, a, w, c) \
+ { \
+ BN_ULLONG t; \
+ t = (BN_ULLONG)w * (a) + (r) + (c); \
+ (r) = Lw(t); \
+ (c) = Hw(t); \
+ }
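+
+/* A worked example of mul_add with (hypothetical) 8-bit words:
+ * mul_add(r = 200, a = 20, w = 13, c = 100) computes t = 13*20 + 200 + 100
+ * = 560 = 2*256 + 48, so r becomes 48 and c becomes 2. */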
+
+#define mul(r, a, w, c) \
+ { \
+ BN_ULLONG t; \
+ t = (BN_ULLONG)w * (a) + (c); \
+ (r) = Lw(t); \
+ (c) = Hw(t); \
+ }
+
+#define sqr(r0, r1, a) \
+ { \
+ BN_ULLONG t; \
+ t = (BN_ULLONG)(a) * (a); \
+ (r0) = Lw(t); \
+ (r1) = Hw(t); \
+ }
+
+#elif defined(BN_UMULT_LOHI)
+#define mul_add(r, a, w, c) \
+ { \
+ BN_ULONG high, low, ret, tmp = (a); \
+ ret = (r); \
+ BN_UMULT_LOHI(low, high, w, tmp); \
+ ret += (c); \
+ (c) = (ret < (c)) ? 1 : 0; \
+ (c) += high; \
+ ret += low; \
+ (c) += (ret < low) ? 1 : 0; \
+ (r) = ret; \
+ }
+
+#define mul(r, a, w, c) \
+ { \
+ BN_ULONG high, low, ret, ta = (a); \
+ BN_UMULT_LOHI(low, high, w, ta); \
+ ret = low + (c); \
+ (c) = high; \
+ (c) += (ret < low) ? 1 : 0; \
+ (r) = ret; \
+ }
+
+#define sqr(r0, r1, a) \
+ { \
+ BN_ULONG tmp = (a); \
+ BN_UMULT_LOHI(r0, r1, tmp, tmp); \
+ }
+
+#else
+
+/*************************************************************
+ * No long long type
+ */
+
+#define LBITS(a) ((a) & BN_MASK2l)
+#define HBITS(a) (((a) >> BN_BITS4) & BN_MASK2l)
+#define L2HBITS(a) (((a) << BN_BITS4) & BN_MASK2)
+
+#define LLBITS(a) ((a) & BN_MASKl)
+#define LHBITS(a) (((a) >> BN_BITS2) & BN_MASKl)
+#define LL2HBITS(a) ((BN_ULLONG)((a) & BN_MASKl) << BN_BITS2)
+
+#define mul64(l, h, bl, bh) \
+ { \
+ BN_ULONG m, m1, lt, ht; \
+ \
+ lt = l; \
+ ht = h; \
+ m = (bh) * (lt); \
+ lt = (bl) * (lt); \
+ m1 = (bl) * (ht); \
+ ht = (bh) * (ht); \
+ m = (m + m1) & BN_MASK2; \
+ if (m < m1) \
+ ht += L2HBITS((BN_ULONG)1); \
+ ht += HBITS(m); \
+ m1 = L2HBITS(m); \
+ lt = (lt + m1) & BN_MASK2; \
+ if (lt < m1) \
+ ht++; \
+ (l) = lt; \
+ (h) = ht; \
+ }
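+
+/* mul64 assembles a full double-width product from four half-word products,
+ * schoolbook style. A worked example with (hypothetical) 8-bit words and
+ * 4-bit halves: 0x12 * 0x34 == 0x3a8, so (l, h) becomes (0xa8, 0x03). */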
+
+#define sqr64(lo, ho, in) \
+ { \
+ BN_ULONG l, h, m; \
+ \
+ h = (in); \
+ l = LBITS(h); \
+ h = HBITS(h); \
+ m = (l) * (h); \
+ l *= l; \
+ h *= h; \
+ h += (m & BN_MASK2h1) >> (BN_BITS4 - 1); \
+ m = (m & BN_MASK2l) << (BN_BITS4 + 1); \
+ l = (l + m) & BN_MASK2; \
+ if (l < m) \
+ h++; \
+ (lo) = l; \
+ (ho) = h; \
+ }
+
+#define mul_add(r, a, bl, bh, c) \
+ { \
+ BN_ULONG l, h; \
+ \
+ h = (a); \
+ l = LBITS(h); \
+ h = HBITS(h); \
+ mul64(l, h, (bl), (bh)); \
+ \
+ /* non-multiply part */ \
+ l = (l + (c)) & BN_MASK2; \
+ if (l < (c)) \
+ h++; \
+ (c) = (r); \
+ l = (l + (c)) & BN_MASK2; \
+ if (l < (c)) \
+ h++; \
+ (c) = h & BN_MASK2; \
+ (r) = l; \
+ }
+
+#define mul(r, a, bl, bh, c) \
+ { \
+ BN_ULONG l, h; \
+ \
+ h = (a); \
+ l = LBITS(h); \
+ h = HBITS(h); \
+ mul64(l, h, (bl), (bh)); \
+ \
+ /* non-multiply part */ \
+ l += (c); \
+ if ((l & BN_MASK2) < (c)) \
+ h++; \
+ (c) = h & BN_MASK2; \
+ (r) = l & BN_MASK2; \
+ }
+#endif /* !BN_LLONG */
+
+#if defined(BN_LLONG) || defined(BN_UMULT_HIGH)
+
+BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
+ BN_ULONG w) {
+ BN_ULONG c1 = 0;
+
+ assert(num >= 0);
+ if (num <= 0) {
+ return c1;
+ }
+
+ while (num & ~3) {
+ mul_add(rp[0], ap[0], w, c1);
+ mul_add(rp[1], ap[1], w, c1);
+ mul_add(rp[2], ap[2], w, c1);
+ mul_add(rp[3], ap[3], w, c1);
+ ap += 4;
+ rp += 4;
+ num -= 4;
+ }
+
+ while (num) {
+ mul_add(rp[0], ap[0], w, c1);
+ ap++;
+ rp++;
+ num--;
+ }
+
+ return c1;
+}
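+
+/* Semantics sketch for the function above: rp[] += ap[] * w across |num|
+ * words, returning the final carry. E.g. with num == 2, rp = {1, 0},
+ * ap = {2, 3} and w == 4, the result is rp = {9, 12} with carry 0. */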
+
+BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) {
+ BN_ULONG c1 = 0;
+
+ assert(num >= 0);
+ if (num <= 0) {
+ return c1;
+ }
+
+ while (num & ~3) {
+ mul(rp[0], ap[0], w, c1);
+ mul(rp[1], ap[1], w, c1);
+ mul(rp[2], ap[2], w, c1);
+ mul(rp[3], ap[3], w, c1);
+ ap += 4;
+ rp += 4;
+ num -= 4;
+ }
+ while (num) {
+ mul(rp[0], ap[0], w, c1);
+ ap++;
+ rp++;
+ num--;
+ }
+ return c1;
+}
+
+void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) {
+ assert(n >= 0);
+ if (n <= 0) {
+ return;
+ }
+
+ while (n & ~3) {
+ sqr(r[0], r[1], a[0]);
+ sqr(r[2], r[3], a[1]);
+ sqr(r[4], r[5], a[2]);
+ sqr(r[6], r[7], a[3]);
+ a += 4;
+ r += 8;
+ n -= 4;
+ }
+ while (n) {
+ sqr(r[0], r[1], a[0]);
+ a++;
+ r += 2;
+ n--;
+ }
+}
+
+#else /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */
+
+BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
+ BN_ULONG w) {
+ BN_ULONG c = 0;
+ BN_ULONG bl, bh;
+
+ assert(num >= 0);
+ if (num <= 0) {
+ return (BN_ULONG)0;
+ }
+
+ bl = LBITS(w);
+ bh = HBITS(w);
+
+ while (num & ~3) {
+ mul_add(rp[0], ap[0], bl, bh, c);
+ mul_add(rp[1], ap[1], bl, bh, c);
+ mul_add(rp[2], ap[2], bl, bh, c);
+ mul_add(rp[3], ap[3], bl, bh, c);
+ ap += 4;
+ rp += 4;
+ num -= 4;
+ }
+ while (num) {
+ mul_add(rp[0], ap[0], bl, bh, c);
+ ap++;
+ rp++;
+ num--;
+ }
+ return c;
+}
+
+BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) {
+ BN_ULONG carry = 0;
+ BN_ULONG bl, bh;
+
+ assert(num >= 0);
+ if (num <= 0) {
+ return (BN_ULONG)0;
+ }
+
+ bl = LBITS(w);
+ bh = HBITS(w);
+
+ while (num & ~3) {
+ mul(rp[0], ap[0], bl, bh, carry);
+ mul(rp[1], ap[1], bl, bh, carry);
+ mul(rp[2], ap[2], bl, bh, carry);
+ mul(rp[3], ap[3], bl, bh, carry);
+ ap += 4;
+ rp += 4;
+ num -= 4;
+ }
+ while (num) {
+ mul(rp[0], ap[0], bl, bh, carry);
+ ap++;
+ rp++;
+ num--;
+ }
+ return carry;
+}
+
+void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) {
+ assert(n >= 0);
+ if (n <= 0) {
+ return;
+ }
+
+ while (n & ~3) {
+ sqr64(r[0], r[1], a[0]);
+ sqr64(r[2], r[3], a[1]);
+ sqr64(r[4], r[5], a[2]);
+ sqr64(r[6], r[7], a[3]);
+ a += 4;
+ r += 8;
+ n -= 4;
+ }
+ while (n) {
+ sqr64(r[0], r[1], a[0]);
+ a++;
+ r += 2;
+ n--;
+ }
+}
+
+#endif /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */
+
+#if defined(BN_LLONG)
+
+BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
+ return (BN_ULONG)(((((BN_ULLONG)h) << BN_BITS2) | l) / (BN_ULLONG)d);
+}
+
+#else
+
+/* Divide h,l by d and return the result. */
+BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
+ BN_ULONG dh, dl, q, ret = 0, th, tl, t;
+ int i, count = 2;
+
+ if (d == 0) {
+ return BN_MASK2;
+ }
+
+ i = BN_num_bits_word(d);
+ assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i));
+
+ i = BN_BITS2 - i;
+ if (h >= d) {
+ h -= d;
+ }
+
+ if (i) {
+ d <<= i;
+ h = (h << i) | (l >> (BN_BITS2 - i));
+ l <<= i;
+ }
+ dh = (d & BN_MASK2h) >> BN_BITS4;
+ dl = (d & BN_MASK2l);
+ for (;;) {
+ if ((h >> BN_BITS4) == dh) {
+ q = BN_MASK2l;
+ } else {
+ q = h / dh;
+ }
+
+ th = q * dh;
+ tl = dl * q;
+ for (;;) {
+ t = h - th;
+ if ((t & BN_MASK2h) ||
+ ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4)))) {
+ break;
+ }
+ q--;
+ th -= dh;
+ tl -= dl;
+ }
+ t = (tl >> BN_BITS4);
+ tl = (tl << BN_BITS4) & BN_MASK2h;
+ th += t;
+
+ if (l < tl) {
+ th++;
+ }
+ l -= tl;
+ if (h < th) {
+ h += d;
+ q--;
+ }
+ h -= th;
+
+ if (--count == 0) {
+ break;
+ }
+
+ ret = q << BN_BITS4;
+ h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2;
+ l = (l & BN_MASK2l) << BN_BITS4;
+ }
+
+ ret |= q;
+ return ret;
+}
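+
+/* A worked example with (hypothetical) 8-bit words: h == 0x01, l == 0x2c,
+ * d == 0x10 gives floor(0x12c / 0x10) == 0x12, assuming h < d so that the
+ * quotient fits in a single word. */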
+
+#endif /* !defined(BN_LLONG) */
+
+#ifdef BN_LLONG
+BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
+ int n) {
+ BN_ULLONG ll = 0;
+
+ assert(n >= 0);
+ if (n <= 0) {
+ return (BN_ULONG)0;
+ }
+
+ while (n & ~3) {
+ ll += (BN_ULLONG)a[0] + b[0];
+ r[0] = (BN_ULONG)ll & BN_MASK2;
+ ll >>= BN_BITS2;
+ ll += (BN_ULLONG)a[1] + b[1];
+ r[1] = (BN_ULONG)ll & BN_MASK2;
+ ll >>= BN_BITS2;
+ ll += (BN_ULLONG)a[2] + b[2];
+ r[2] = (BN_ULONG)ll & BN_MASK2;
+ ll >>= BN_BITS2;
+ ll += (BN_ULLONG)a[3] + b[3];
+ r[3] = (BN_ULONG)ll & BN_MASK2;
+ ll >>= BN_BITS2;
+ a += 4;
+ b += 4;
+ r += 4;
+ n -= 4;
+ }
+ while (n) {
+ ll += (BN_ULLONG)a[0] + b[0];
+ r[0] = (BN_ULONG)ll & BN_MASK2;
+ ll >>= BN_BITS2;
+ a++;
+ b++;
+ r++;
+ n--;
+ }
+ return (BN_ULONG)ll;
+}
+
+#else /* !BN_LLONG */
+
+BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
+ int n) {
+ BN_ULONG c, l, t;
+
+ assert(n >= 0);
+ if (n <= 0) {
+ return (BN_ULONG)0;
+ }
+
+ c = 0;
+ while (n & ~3) {
+ t = a[0];
+ t = (t + c) & BN_MASK2;
+ c = (t < c);
+ l = (t + b[0]) & BN_MASK2;
+ c += (l < t);
+ r[0] = l;
+ t = a[1];
+ t = (t + c) & BN_MASK2;
+ c = (t < c);
+ l = (t + b[1]) & BN_MASK2;
+ c += (l < t);
+ r[1] = l;
+ t = a[2];
+ t = (t + c) & BN_MASK2;
+ c = (t < c);
+ l = (t + b[2]) & BN_MASK2;
+ c += (l < t);
+ r[2] = l;
+ t = a[3];
+ t = (t + c) & BN_MASK2;
+ c = (t < c);
+ l = (t + b[3]) & BN_MASK2;
+ c += (l < t);
+ r[3] = l;
+ a += 4;
+ b += 4;
+ r += 4;
+ n -= 4;
+ }
+ while (n) {
+ t = a[0];
+ t = (t + c) & BN_MASK2;
+ c = (t < c);
+ l = (t + b[0]) & BN_MASK2;
+ c += (l < t);
+ r[0] = l;
+ a++;
+ b++;
+ r++;
+ n--;
+ }
+ return (BN_ULONG)c;
+}
+
+#endif /* !BN_LLONG */
+
+BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
+ int n) {
+ BN_ULONG t1, t2;
+ int c = 0;
+
+ assert(n >= 0);
+ if (n <= 0) {
+ return (BN_ULONG)0;
+ }
+
+ while (n & ~3) {
+ t1 = a[0];
+ t2 = b[0];
+ r[0] = (t1 - t2 - c) & BN_MASK2;
+ if (t1 != t2)
+ c = (t1 < t2);
+ t1 = a[1];
+ t2 = b[1];
+ r[1] = (t1 - t2 - c) & BN_MASK2;
+ if (t1 != t2)
+ c = (t1 < t2);
+ t1 = a[2];
+ t2 = b[2];
+ r[2] = (t1 - t2 - c) & BN_MASK2;
+ if (t1 != t2)
+ c = (t1 < t2);
+ t1 = a[3];
+ t2 = b[3];
+ r[3] = (t1 - t2 - c) & BN_MASK2;
+ if (t1 != t2)
+ c = (t1 < t2);
+ a += 4;
+ b += 4;
+ r += 4;
+ n -= 4;
+ }
+ while (n) {
+ t1 = a[0];
+ t2 = b[0];
+ r[0] = (t1 - t2 - c) & BN_MASK2;
+ if (t1 != t2)
+ c = (t1 < t2);
+ a++;
+ b++;
+ r++;
+ n--;
+ }
+ return c;
+}
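+
+/* A worked example of the borrow propagation above, with (hypothetical)
+ * 8-bit words and n == 2 (words least-significant first):
+ * a = {0x00, 0x01}, b = {0x01, 0x00}, i.e. 0x0100 - 0x0001, yields
+ * r = {0xff, 0x00} with a final borrow of 0. */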
+
+/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
+/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
+/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
+/* sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
+
+#ifdef BN_LLONG
+
+/* Keep in mind that adding a word to a double-word multiplication result
+ * cannot overflow, because the high half of the product cannot be all-ones. */
+#define mul_add_c(a, b, c0, c1, c2) \
+ do { \
+ BN_ULONG hi; \
+ BN_ULLONG t = (BN_ULLONG)(a) * (b); \
+ t += c0; /* no carry */ \
+ c0 = (BN_ULONG)Lw(t); \
+ hi = (BN_ULONG)Hw(t); \
+ c1 = (c1 + hi) & BN_MASK2; \
+ if (c1 < hi) \
+ c2++; \
+ } while (0)
+
+#define mul_add_c2(a, b, c0, c1, c2) \
+ do { \
+ BN_ULONG hi; \
+ BN_ULLONG t = (BN_ULLONG)(a) * (b); \
+ BN_ULLONG tt = t + c0; /* no carry */ \
+ c0 = (BN_ULONG)Lw(tt); \
+ hi = (BN_ULONG)Hw(tt); \
+ c1 = (c1 + hi) & BN_MASK2; \
+ if (c1 < hi) \
+ c2++; \
+ t += c0; /* no carry */ \
+ c0 = (BN_ULONG)Lw(t); \
+ hi = (BN_ULONG)Hw(t); \
+ c1 = (c1 + hi) & BN_MASK2; \
+ if (c1 < hi) \
+ c2++; \
+ } while (0)
+
+#define sqr_add_c(a, i, c0, c1, c2) \
+ do { \
+ BN_ULONG hi; \
+ BN_ULLONG t = (BN_ULLONG)a[i] * a[i]; \
+ t += c0; /* no carry */ \
+ c0 = (BN_ULONG)Lw(t); \
+ hi = (BN_ULONG)Hw(t); \
+ c1 = (c1 + hi) & BN_MASK2; \
+ if (c1 < hi) \
+ c2++; \
+ } while (0)
+
+#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
+
+#elif defined(BN_UMULT_LOHI)
+
+/* Keep in mind that additions to hi can not overflow, because the high word of
+ * a multiplication result cannot be all-ones. */
+#define mul_add_c(a, b, c0, c1, c2) \
+ do { \
+ BN_ULONG ta = (a), tb = (b); \
+ BN_ULONG lo, hi; \
+ BN_UMULT_LOHI(lo, hi, ta, tb); \
+ c0 += lo; \
+ hi += (c0 < lo) ? 1 : 0; \
+ c1 += hi; \
+ c2 += (c1 < hi) ? 1 : 0; \
+ } while (0)
+
+#define mul_add_c2(a, b, c0, c1, c2) \
+ do { \
+ BN_ULONG ta = (a), tb = (b); \
+ BN_ULONG lo, hi, tt; \
+ BN_UMULT_LOHI(lo, hi, ta, tb); \
+ c0 += lo; \
+ tt = hi + ((c0 < lo) ? 1 : 0); \
+ c1 += tt; \
+ c2 += (c1 < tt) ? 1 : 0; \
+ c0 += lo; \
+ hi += (c0 < lo) ? 1 : 0; \
+ c1 += hi; \
+ c2 += (c1 < hi) ? 1 : 0; \
+ } while (0)
+
+#define sqr_add_c(a, i, c0, c1, c2) \
+ do { \
+ BN_ULONG ta = (a)[i]; \
+ BN_ULONG lo, hi; \
+ BN_UMULT_LOHI(lo, hi, ta, ta); \
+ c0 += lo; \
+ hi += (c0 < lo) ? 1 : 0; \
+ c1 += hi; \
+ c2 += (c1 < hi) ? 1 : 0; \
+ } while (0)
+
+#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
+
+#else /* !BN_LLONG */
+
+/* Keep in mind that additions to hi can not overflow, because
+ * the high word of a multiplication result cannot be all-ones. */
+
+#define mul_add_c(a, b, c0, c1, c2) \
+ do { \
+ BN_ULONG lo = LBITS(a), hi = HBITS(a); \
+ BN_ULONG bl = LBITS(b), bh = HBITS(b); \
+ mul64(lo, hi, bl, bh); \
+ c0 = (c0 + lo) & BN_MASK2; \
+ if (c0 < lo) \
+ hi++; \
+ c1 = (c1 + hi) & BN_MASK2; \
+ if (c1 < hi) \
+ c2++; \
+ } while (0)
+
+#define mul_add_c2(a, b, c0, c1, c2) \
+ do { \
+ BN_ULONG tt; \
+ BN_ULONG lo = LBITS(a), hi = HBITS(a); \
+ BN_ULONG bl = LBITS(b), bh = HBITS(b); \
+ mul64(lo, hi, bl, bh); \
+ tt = hi; \
+ c0 = (c0 + lo) & BN_MASK2; \
+ if (c0 < lo) \
+ tt++; \
+ c1 = (c1 + tt) & BN_MASK2; \
+ if (c1 < tt) \
+ c2++; \
+ c0 = (c0 + lo) & BN_MASK2; \
+ if (c0 < lo) \
+ hi++; \
+ c1 = (c1 + hi) & BN_MASK2; \
+ if (c1 < hi) \
+ c2++; \
+ } while (0)
+
+#define sqr_add_c(a, i, c0, c1, c2) \
+ do { \
+ BN_ULONG lo, hi; \
+ sqr64(lo, hi, (a)[i]); \
+ c0 = (c0 + lo) & BN_MASK2; \
+ if (c0 < lo) \
+ hi++; \
+ c1 = (c1 + hi) & BN_MASK2; \
+ if (c1 < hi) \
+ c2++; \
+ } while (0)
+
+#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
+#endif /* !BN_LLONG */
+
+void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
+ BN_ULONG c1, c2, c3;
+
+ c1 = 0;
+ c2 = 0;
+ c3 = 0;
+ mul_add_c(a[0], b[0], c1, c2, c3);
+ r[0] = c1;
+ c1 = 0;
+ mul_add_c(a[0], b[1], c2, c3, c1);
+ mul_add_c(a[1], b[0], c2, c3, c1);
+ r[1] = c2;
+ c2 = 0;
+ mul_add_c(a[2], b[0], c3, c1, c2);
+ mul_add_c(a[1], b[1], c3, c1, c2);
+ mul_add_c(a[0], b[2], c3, c1, c2);
+ r[2] = c3;
+ c3 = 0;
+ mul_add_c(a[0], b[3], c1, c2, c3);
+ mul_add_c(a[1], b[2], c1, c2, c3);
+ mul_add_c(a[2], b[1], c1, c2, c3);
+ mul_add_c(a[3], b[0], c1, c2, c3);
+ r[3] = c1;
+ c1 = 0;
+ mul_add_c(a[4], b[0], c2, c3, c1);
+ mul_add_c(a[3], b[1], c2, c3, c1);
+ mul_add_c(a[2], b[2], c2, c3, c1);
+ mul_add_c(a[1], b[3], c2, c3, c1);
+ mul_add_c(a[0], b[4], c2, c3, c1);
+ r[4] = c2;
+ c2 = 0;
+ mul_add_c(a[0], b[5], c3, c1, c2);
+ mul_add_c(a[1], b[4], c3, c1, c2);
+ mul_add_c(a[2], b[3], c3, c1, c2);
+ mul_add_c(a[3], b[2], c3, c1, c2);
+ mul_add_c(a[4], b[1], c3, c1, c2);
+ mul_add_c(a[5], b[0], c3, c1, c2);
+ r[5] = c3;
+ c3 = 0;
+ mul_add_c(a[6], b[0], c1, c2, c3);
+ mul_add_c(a[5], b[1], c1, c2, c3);
+ mul_add_c(a[4], b[2], c1, c2, c3);
+ mul_add_c(a[3], b[3], c1, c2, c3);
+ mul_add_c(a[2], b[4], c1, c2, c3);
+ mul_add_c(a[1], b[5], c1, c2, c3);
+ mul_add_c(a[0], b[6], c1, c2, c3);
+ r[6] = c1;
+ c1 = 0;
+ mul_add_c(a[0], b[7], c2, c3, c1);
+ mul_add_c(a[1], b[6], c2, c3, c1);
+ mul_add_c(a[2], b[5], c2, c3, c1);
+ mul_add_c(a[3], b[4], c2, c3, c1);
+ mul_add_c(a[4], b[3], c2, c3, c1);
+ mul_add_c(a[5], b[2], c2, c3, c1);
+ mul_add_c(a[6], b[1], c2, c3, c1);
+ mul_add_c(a[7], b[0], c2, c3, c1);
+ r[7] = c2;
+ c2 = 0;
+ mul_add_c(a[7], b[1], c3, c1, c2);
+ mul_add_c(a[6], b[2], c3, c1, c2);
+ mul_add_c(a[5], b[3], c3, c1, c2);
+ mul_add_c(a[4], b[4], c3, c1, c2);
+ mul_add_c(a[3], b[5], c3, c1, c2);
+ mul_add_c(a[2], b[6], c3, c1, c2);
+ mul_add_c(a[1], b[7], c3, c1, c2);
+ r[8] = c3;
+ c3 = 0;
+ mul_add_c(a[2], b[7], c1, c2, c3);
+ mul_add_c(a[3], b[6], c1, c2, c3);
+ mul_add_c(a[4], b[5], c1, c2, c3);
+ mul_add_c(a[5], b[4], c1, c2, c3);
+ mul_add_c(a[6], b[3], c1, c2, c3);
+ mul_add_c(a[7], b[2], c1, c2, c3);
+ r[9] = c1;
+ c1 = 0;
+ mul_add_c(a[7], b[3], c2, c3, c1);
+ mul_add_c(a[6], b[4], c2, c3, c1);
+ mul_add_c(a[5], b[5], c2, c3, c1);
+ mul_add_c(a[4], b[6], c2, c3, c1);
+ mul_add_c(a[3], b[7], c2, c3, c1);
+ r[10] = c2;
+ c2 = 0;
+ mul_add_c(a[4], b[7], c3, c1, c2);
+ mul_add_c(a[5], b[6], c3, c1, c2);
+ mul_add_c(a[6], b[5], c3, c1, c2);
+ mul_add_c(a[7], b[4], c3, c1, c2);
+ r[11] = c3;
+ c3 = 0;
+ mul_add_c(a[7], b[5], c1, c2, c3);
+ mul_add_c(a[6], b[6], c1, c2, c3);
+ mul_add_c(a[5], b[7], c1, c2, c3);
+ r[12] = c1;
+ c1 = 0;
+ mul_add_c(a[6], b[7], c2, c3, c1);
+ mul_add_c(a[7], b[6], c2, c3, c1);
+ r[13] = c2;
+ c2 = 0;
+ mul_add_c(a[7], b[7], c3, c1, c2);
+ r[14] = c3;
+ r[15] = c1;
+}
+
+void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
+ BN_ULONG c1, c2, c3;
+
+ c1 = 0;
+ c2 = 0;
+ c3 = 0;
+ mul_add_c(a[0], b[0], c1, c2, c3);
+ r[0] = c1;
+ c1 = 0;
+ mul_add_c(a[0], b[1], c2, c3, c1);
+ mul_add_c(a[1], b[0], c2, c3, c1);
+ r[1] = c2;
+ c2 = 0;
+ mul_add_c(a[2], b[0], c3, c1, c2);
+ mul_add_c(a[1], b[1], c3, c1, c2);
+ mul_add_c(a[0], b[2], c3, c1, c2);
+ r[2] = c3;
+ c3 = 0;
+ mul_add_c(a[0], b[3], c1, c2, c3);
+ mul_add_c(a[1], b[2], c1, c2, c3);
+ mul_add_c(a[2], b[1], c1, c2, c3);
+ mul_add_c(a[3], b[0], c1, c2, c3);
+ r[3] = c1;
+ c1 = 0;
+ mul_add_c(a[3], b[1], c2, c3, c1);
+ mul_add_c(a[2], b[2], c2, c3, c1);
+ mul_add_c(a[1], b[3], c2, c3, c1);
+ r[4] = c2;
+ c2 = 0;
+ mul_add_c(a[2], b[3], c3, c1, c2);
+ mul_add_c(a[3], b[2], c3, c1, c2);
+ r[5] = c3;
+ c3 = 0;
+ mul_add_c(a[3], b[3], c1, c2, c3);
+ r[6] = c1;
+ r[7] = c2;
+}
+
+void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) {
+ BN_ULONG c1, c2, c3;
+
+ c1 = 0;
+ c2 = 0;
+ c3 = 0;
+ sqr_add_c(a, 0, c1, c2, c3);
+ r[0] = c1;
+ c1 = 0;
+ sqr_add_c2(a, 1, 0, c2, c3, c1);
+ r[1] = c2;
+ c2 = 0;
+ sqr_add_c(a, 1, c3, c1, c2);
+ sqr_add_c2(a, 2, 0, c3, c1, c2);
+ r[2] = c3;
+ c3 = 0;
+ sqr_add_c2(a, 3, 0, c1, c2, c3);
+ sqr_add_c2(a, 2, 1, c1, c2, c3);
+ r[3] = c1;
+ c1 = 0;
+ sqr_add_c(a, 2, c2, c3, c1);
+ sqr_add_c2(a, 3, 1, c2, c3, c1);
+ sqr_add_c2(a, 4, 0, c2, c3, c1);
+ r[4] = c2;
+ c2 = 0;
+ sqr_add_c2(a, 5, 0, c3, c1, c2);
+ sqr_add_c2(a, 4, 1, c3, c1, c2);
+ sqr_add_c2(a, 3, 2, c3, c1, c2);
+ r[5] = c3;
+ c3 = 0;
+ sqr_add_c(a, 3, c1, c2, c3);
+ sqr_add_c2(a, 4, 2, c1, c2, c3);
+ sqr_add_c2(a, 5, 1, c1, c2, c3);
+ sqr_add_c2(a, 6, 0, c1, c2, c3);
+ r[6] = c1;
+ c1 = 0;
+ sqr_add_c2(a, 7, 0, c2, c3, c1);
+ sqr_add_c2(a, 6, 1, c2, c3, c1);
+ sqr_add_c2(a, 5, 2, c2, c3, c1);
+ sqr_add_c2(a, 4, 3, c2, c3, c1);
+ r[7] = c2;
+ c2 = 0;
+ sqr_add_c(a, 4, c3, c1, c2);
+ sqr_add_c2(a, 5, 3, c3, c1, c2);
+ sqr_add_c2(a, 6, 2, c3, c1, c2);
+ sqr_add_c2(a, 7, 1, c3, c1, c2);
+ r[8] = c3;
+ c3 = 0;
+ sqr_add_c2(a, 7, 2, c1, c2, c3);
+ sqr_add_c2(a, 6, 3, c1, c2, c3);
+ sqr_add_c2(a, 5, 4, c1, c2, c3);
+ r[9] = c1;
+ c1 = 0;
+ sqr_add_c(a, 5, c2, c3, c1);
+ sqr_add_c2(a, 6, 4, c2, c3, c1);
+ sqr_add_c2(a, 7, 3, c2, c3, c1);
+ r[10] = c2;
+ c2 = 0;
+ sqr_add_c2(a, 7, 4, c3, c1, c2);
+ sqr_add_c2(a, 6, 5, c3, c1, c2);
+ r[11] = c3;
+ c3 = 0;
+ sqr_add_c(a, 6, c1, c2, c3);
+ sqr_add_c2(a, 7, 5, c1, c2, c3);
+ r[12] = c1;
+ c1 = 0;
+ sqr_add_c2(a, 7, 6, c2, c3, c1);
+ r[13] = c2;
+ c2 = 0;
+ sqr_add_c(a, 7, c3, c1, c2);
+ r[14] = c3;
+ r[15] = c1;
+}
+
+void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) {
+ BN_ULONG c1, c2, c3;
+
+ c1 = 0;
+ c2 = 0;
+ c3 = 0;
+ sqr_add_c(a, 0, c1, c2, c3);
+ r[0] = c1;
+ c1 = 0;
+ sqr_add_c2(a, 1, 0, c2, c3, c1);
+ r[1] = c2;
+ c2 = 0;
+ sqr_add_c(a, 1, c3, c1, c2);
+ sqr_add_c2(a, 2, 0, c3, c1, c2);
+ r[2] = c3;
+ c3 = 0;
+ sqr_add_c2(a, 3, 0, c1, c2, c3);
+ sqr_add_c2(a, 2, 1, c1, c2, c3);
+ r[3] = c1;
+ c1 = 0;
+ sqr_add_c(a, 2, c2, c3, c1);
+ sqr_add_c2(a, 3, 1, c2, c3, c1);
+ r[4] = c2;
+ c2 = 0;
+ sqr_add_c2(a, 3, 2, c3, c1, c2);
+ r[5] = c3;
+ c3 = 0;
+ sqr_add_c(a, 3, c1, c2, c3);
+ r[6] = c1;
+ r[7] = c2;
+}
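+
+/* In the Comba routines above, each output word r[k] accumulates every
+ * partial product a[i]*b[j] with i + j == k (plus incoming carries) in the
+ * rotating three-word accumulator (c1, c2, c3) before being stored. */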
+
+#if defined(OPENSSL_NO_ASM) || (!defined(OPENSSL_ARM) && !defined(OPENSSL_X86_64))
+/* This is essentially a reference implementation, which may or may not
+ * result in a performance improvement. E.g. on IA-32 this routine was
+ * observed to give 40% faster rsa1024 private key operations and 10%
+ * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only
+ * by 10% and *worsens* rsa4096 sign by 15%. Once again, it is a
+ * reference implementation, to be used as a starting point for
+ * platform-specific assembly. The numbers mentioned apply to
+ * compiler-generated code compiled with and without -DOPENSSL_BN_ASM_MONT
+ * and can vary not only from platform to platform but even across
+ * compiler versions. Assembler vs. assembler improvement coefficients
+ * can [and are known to] differ and are to be documented elsewhere. */
+int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+ const BN_ULONG *np, const BN_ULONG *n0p, int num) {
+ BN_ULONG c0, c1, ml, *tp, n0;
+#ifdef mul64
+ BN_ULONG mh;
+#endif
+ volatile BN_ULONG *vp;
+ int i = 0, j;
+
+#if 0 /* template for platform-specific implementation */
+ if (ap==bp) return bn_sqr_mont(rp,ap,np,n0p,num);
+#endif
+ vp = tp = alloca((num + 2) * sizeof(BN_ULONG));
+
+ n0 = *n0p;
+
+ c0 = 0;
+ ml = bp[0];
+#ifdef mul64
+ mh = HBITS(ml);
+ ml = LBITS(ml);
+ for (j = 0; j < num; ++j)
+ mul(tp[j], ap[j], ml, mh, c0);
+#else
+ for (j = 0; j < num; ++j)
+ mul(tp[j], ap[j], ml, c0);
+#endif
+
+ tp[num] = c0;
+ tp[num + 1] = 0;
+ goto enter;
+
+ for (i = 0; i < num; i++) {
+ c0 = 0;
+ ml = bp[i];
+#ifdef mul64
+ mh = HBITS(ml);
+ ml = LBITS(ml);
+ for (j = 0; j < num; ++j)
+ mul_add(tp[j], ap[j], ml, mh, c0);
+#else
+ for (j = 0; j < num; ++j)
+ mul_add(tp[j], ap[j], ml, c0);
+#endif
+ c1 = (tp[num] + c0) & BN_MASK2;
+ tp[num] = c1;
+ tp[num + 1] = (c1 < c0 ? 1 : 0);
+ enter:
+ c1 = tp[0];
+ ml = (c1 * n0) & BN_MASK2;
+ c0 = 0;
+#ifdef mul64
+ mh = HBITS(ml);
+ ml = LBITS(ml);
+ mul_add(c1, np[0], ml, mh, c0);
+#else
+ mul_add(c1, ml, np[0], c0);
+#endif
+ for (j = 1; j < num; j++) {
+ c1 = tp[j];
+#ifdef mul64
+ mul_add(c1, np[j], ml, mh, c0);
+#else
+ mul_add(c1, ml, np[j], c0);
+#endif
+ tp[j - 1] = c1 & BN_MASK2;
+ }
+ c1 = (tp[num] + c0) & BN_MASK2;
+ tp[num - 1] = c1;
+ tp[num] = tp[num + 1] + (c1 < c0 ? 1 : 0);
+ }
+
+ if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
+ c0 = bn_sub_words(rp, tp, np, num);
+ if (tp[num] != 0 || c0 == 0) {
+ for (i = 0; i < num + 2; i++)
+ vp[i] = 0;
+ return 1;
+ }
+ }
+ for (i = 0; i < num; i++)
+ rp[i] = tp[i], vp[i] = 0;
+ vp[num] = 0;
+ vp[num + 1] = 0;
+ return 1;
+}
+#endif
+
+#endif
diff --git a/src/crypto/bn/internal.h b/src/crypto/bn/internal.h
new file mode 100644
index 0000000..d421cf3
--- /dev/null
+++ b/src/crypto/bn/internal.h
@@ -0,0 +1,297 @@
+/* Copyright (C) 1995-1997 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to. The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code. The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * "This product includes cryptographic software written by
+ * Eric Young (eay@cryptsoft.com)"
+ * The word 'cryptographic' can be left out if the rouines from the library
+ * being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ * the apps directory (application code) you must include an acknowledgement:
+ * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed. i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+/* ====================================================================
+ * Copyright (c) 1998-2006 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+/* ====================================================================
+ * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED.
+ *
+ * Portions of the attached software ("Contribution") are developed by
+ * SUN MICROSYSTEMS, INC., and are contributed to the OpenSSL project.
+ *
+ * The Contribution is licensed pursuant to the Eric Young open source
+ * license provided above.
+ *
+ * The binary polynomial arithmetic software is originally written by
+ * Sheueling Chang Shantz and Douglas Stebila of Sun Microsystems
+ * Laboratories. */
+
+#ifndef OPENSSL_HEADER_BN_INTERNAL_H
+#define OPENSSL_HEADER_BN_INTERNAL_H
+
+#include <openssl/base.h>
+
+#include <inttypes.h>
+
+#if defined(OPENSSL_X86_64) && defined(_MSC_VER) && _MSC_VER >= 1400
+#include <intrin.h>
+#pragma intrinsic(__umulh, _umul128)
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* bn_expand acts the same as |BN_wexpand|, but takes a number of bits rather
+ * than a number of words. */
+BIGNUM *bn_expand(BIGNUM *bn, unsigned bits);
+
+#if defined(OPENSSL_64_BIT)
+
+#if !defined(_MSC_VER)
+/* MSVC doesn't support two-word integers on 64-bit. */
+#define BN_LLONG __int128_t
+#define BN_ULLONG __uint128_t
+#endif
+
+#define BN_BITS 128
+#define BN_BITS2 64
+#define BN_BYTES 8
+#define BN_BITS4 32
+#define BN_MASK (0xffffffffffffffffffffffffffffffffLL)
+#define BN_MASK2 (0xffffffffffffffffL)
+#define BN_MASK2l (0xffffffffL)
+#define BN_MASK2h (0xffffffff00000000L)
+#define BN_MASK2h1 (0xffffffff80000000L)
+#define BN_TBIT (0x8000000000000000L)
+#define BN_DEC_CONV (10000000000000000000UL)
+#define BN_DEC_FMT1 "%" PRIu64
+#define BN_DEC_FMT2 "%019" PRIu64
+#define BN_DEC_NUM 19
+#define BN_HEX_FMT1 "%" PRIx64
+
+#elif defined(OPENSSL_32_BIT)
+
+#define BN_LLONG int64_t
+#define BN_ULLONG uint64_t
+#define BN_MASK (0xffffffffffffffffLL)
+#define BN_BITS 64
+#define BN_BITS2 32
+#define BN_BYTES 4
+#define BN_BITS4 16
+#define BN_MASK2 (0xffffffffL)
+#define BN_MASK2l (0xffff)
+#define BN_MASK2h1 (0xffff8000L)
+#define BN_MASK2h (0xffff0000L)
+#define BN_TBIT (0x80000000L)
+#define BN_DEC_CONV (1000000000L)
+#define BN_DEC_FMT1 "%" PRIu32
+#define BN_DEC_FMT2 "%09" PRIu32
+#define BN_DEC_NUM 9
+#define BN_HEX_FMT1 "%" PRIx32
+
+#else
+#error "Must define either OPENSSL_32_BIT or OPENSSL_64_BIT"
+#endif
+
+/* Pentium pro 16,16,16,32,64 */
+/* Alpha       16,16,16,16,64 */
+#define BN_MULL_SIZE_NORMAL (16) /* 32 */
+#define BN_MUL_RECURSIVE_SIZE_NORMAL (16) /* 32 less than */
+#define BN_SQR_RECURSIVE_SIZE_NORMAL (16) /* 32 */
+#define BN_MUL_LOW_RECURSIVE_SIZE_NORMAL (32) /* 32 */
+#define BN_MONT_CTX_SET_SIZE_WORD (64) /* 32 */
+
+#if defined(BN_LLONG)
+#define Lw(t) (((BN_ULONG)(t))&BN_MASK2)
+#define Hw(t) (((BN_ULONG)((t)>>BN_BITS2))&BN_MASK2)
+#endif
+
+BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w);
+BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w);
+void bn_sqr_words(BN_ULONG *rp, const BN_ULONG *ap, int num);
+BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d);
+BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+                      int num);
+BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+                      int num);
+
+void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b);
+void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b);
+void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a);
+void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a);
+
+/* bn_cmp_words returns a value less than, equal to or greater than zero if
+ * the length-|n| array |a| is less than, equal to or greater than |b|. */
+int bn_cmp_words(const BN_ULONG *a, const BN_ULONG *b, int n);
+
+/* bn_cmp_part_words returns a value less than, equal to or greater than zero
+ * if the array |a| is less than, equal to or greater than |b|. The arrays can
+ * be of different lengths: |cl| gives the minimum of the two lengths and |dl|
+ * gives the length of |a| minus the length of |b|. */
+int bn_cmp_part_words(const BN_ULONG *a, const BN_ULONG *b, int cl, int dl);
+
+int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+ const BN_ULONG *np, const BN_ULONG *n0, int num);
+
+#if !defined(BN_LLONG)
+
+#define LBITS(a) ((a) & BN_MASK2l)
+#define HBITS(a) (((a) >> BN_BITS4) & BN_MASK2l)
+#define L2HBITS(a) (((a) << BN_BITS4) & BN_MASK2)
+
+#define LLBITS(a) ((a) & BN_MASKl)
+#define LHBITS(a) (((a) >> BN_BITS2) & BN_MASKl)
+#define LL2HBITS(a) ((BN_ULLONG)((a) & BN_MASKl) << BN_BITS2)
+
+#define mul64(l, h, bl, bh) \
+ { \
+ BN_ULONG m, m1, lt, ht; \
+ \
+ lt = l; \
+ ht = h; \
+ m = (bh) * (lt); \
+ lt = (bl) * (lt); \
+ m1 = (bl) * (ht); \
+ ht = (bh) * (ht); \
+ m = (m + m1) & BN_MASK2; \
+ if (m < m1) \
+ ht += L2HBITS((BN_ULONG)1); \
+ ht += HBITS(m); \
+ m1 = L2HBITS(m); \
+ lt = (lt + m1) & BN_MASK2; \
+ if (lt < m1) \
+ ht++; \
+ (l) = lt; \
+ (h) = ht; \
+ }
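+
+/* mul64(l, h, bl, bh) replaces the pair (l, h) -- the low and high halves of
+ * one operand word -- with the low and high words of the double-word product
+ * (h*2^BN_BITS4 + l) * (bh*2^BN_BITS4 + bl), using four half-word multiplies
+ * and explicit carry propagation. */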
+
+#endif /* !defined(BN_LLONG) */
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64)
+# if defined(__GNUC__) && __GNUC__ >= 2
+# define BN_UMULT_HIGH(a,b) ({ \
+ register BN_ULONG ret,discard; \
+ __asm__ ("mulq %3" \
+ : "=a"(discard),"=d"(ret) \
+ : "a"(a), "g"(b) \
+ : "cc"); \
+ ret; })
+# define BN_UMULT_LOHI(low,high,a,b) \
+ __asm__ ("mulq %3" \
+ : "=a"(low),"=d"(high) \
+ : "a"(a),"g"(b) \
+ : "cc");
+# elif defined(_MSC_VER) && _MSC_VER >= 1400
+# define BN_UMULT_HIGH(a, b) __umulh((a), (b))
+# define BN_UMULT_LOHI(low, high, a, b) ((low) = _umul128((a), (b), &(high)))
+# endif
+#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64)
+# if defined(__GNUC__) && __GNUC__>=2
+# define BN_UMULT_HIGH(a,b) ({ \
+ register BN_ULONG ret; \
+ __asm__ ("umulh %0,%1,%2" \
+ : "=r"(ret) \
+ : "r"(a), "r"(b)); \
+ ret; })
+# endif
+#endif
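+
+/* For example, with a 64-bit BN_ULONG, BN_UMULT_HIGH(1ULL << 63, 1ULL << 63)
+ * is 1ULL << 62: the full 128-bit product is 2^126, whose high 64 bits are
+ * 2^62. */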
+
+
+#if defined(__cplusplus)
+} /* extern C */
+#endif
+
+#endif /* OPENSSL_HEADER_BN_INTERNAL_H */
diff --git a/src/crypto/bn/kronecker.c b/src/crypto/bn/kronecker.c
new file mode 100644
index 0000000..23ef79a
--- /dev/null
+++ b/src/crypto/bn/kronecker.c
@@ -0,0 +1,175 @@
+/* ====================================================================
+ * Copyright (c) 1998-2000 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com). */
+
+#include <openssl/bn.h>
+
+#include "internal.h"
+
+
+/* least significant word */
+#define BN_lsw(n) (((n)->top == 0) ? (BN_ULONG) 0 : (n)->d[0])
+
+/* Returns -2 for errors because both -1 and 0 are valid results. */
+int BN_kronecker(const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx) {
+ int i;
+ int ret = -2;
+ BIGNUM *A, *B, *tmp;
+ /* In 'tab', only odd-indexed entries are relevant:
+ * For any odd BIGNUM n,
+ * tab[BN_lsw(n) & 7]
+ * is $(-1)^{(n^2-1)/8}$ (using TeX notation).
+ * Note that the sign of n does not matter. */
+ static const int tab[8] = {0, 1, 0, -1, 0, -1, 0, 1};
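+  /* Example: for n = 3, BN_lsw(n) & 7 == 3 and tab[3] == -1, which matches
+   * (-1)^{(3*3-1)/8} = (-1)^1 = -1. */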
+
+ BN_CTX_start(ctx);
+ A = BN_CTX_get(ctx);
+ B = BN_CTX_get(ctx);
+ if (B == NULL) {
+ goto end;
+ }
+
+ if (!BN_copy(A, a) ||
+ !BN_copy(B, b)) {
+ goto end;
+ }
+
+  /* Kronecker symbol, implemented according to Henri Cohen,
+ * "A Course in Computational Algebraic Number Theory"
+ * (algorithm 1.4.10). */
+
+ /* Cohen's step 1: */
+
+ if (BN_is_zero(B)) {
+ ret = BN_abs_is_word(A, 1);
+ goto end;
+ }
+
+ /* Cohen's step 2: */
+
+ if (!BN_is_odd(A) && !BN_is_odd(B)) {
+ ret = 0;
+ goto end;
+ }
+
+ /* now B is non-zero */
+ i = 0;
+ while (!BN_is_bit_set(B, i)) {
+ i++;
+ }
+ if (!BN_rshift(B, B, i)) {
+ goto end;
+ }
+ if (i & 1) {
+ /* i is odd */
+ /* (thus B was even, thus A must be odd!) */
+
+ /* set 'ret' to $(-1)^{(A^2-1)/8}$ */
+ ret = tab[BN_lsw(A) & 7];
+ } else {
+ /* i is even */
+ ret = 1;
+ }
+
+ if (B->neg) {
+ B->neg = 0;
+ if (A->neg) {
+ ret = -ret;
+ }
+ }
+
+ /* now B is positive and odd, so what remains to be done is to compute the
+ * Jacobi symbol (A/B) and multiply it by 'ret' */
+
+ while (1) {
+ /* Cohen's step 3: */
+
+ /* B is positive and odd */
+ if (BN_is_zero(A)) {
+ ret = BN_is_one(B) ? ret : 0;
+ goto end;
+ }
+
+ /* now A is non-zero */
+ i = 0;
+ while (!BN_is_bit_set(A, i)) {
+ i++;
+ }
+ if (!BN_rshift(A, A, i)) {
+ goto end;
+ }
+ if (i & 1) {
+ /* i is odd */
+ /* multiply 'ret' by $(-1)^{(B^2-1)/8}$ */
+ ret = ret * tab[BN_lsw(B) & 7];
+ }
+
+ /* Cohen's step 4: */
+ /* multiply 'ret' by $(-1)^{(A-1)(B-1)/4}$ */
+ if ((A->neg ? ~BN_lsw(A) : BN_lsw(A)) & BN_lsw(B) & 2) {
+ ret = -ret;
+ }
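+    /* Bit 1 tests whether both A and B are 3 mod 4. When A is negative, its
+     * value is -BN_lsw(A) mod 4; since A is odd at this point, ~BN_lsw(A)
+     * has the same bit 1 as -BN_lsw(A), so the complement handles the
+     * sign. */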
+
+ /* (A, B) := (B mod |A|, |A|) */
+ if (!BN_nnmod(B, B, A, ctx)) {
+ ret = -2;
+ goto end;
+ }
+ tmp = A;
+ A = B;
+ B = tmp;
+ tmp->neg = 0;
+ }
+
+end:
+ BN_CTX_end(ctx);
+ return ret;
+}
diff --git a/src/crypto/bn/montgomery.c b/src/crypto/bn/montgomery.c
new file mode 100644
index 0000000..65e177c
--- /dev/null
+++ b/src/crypto/bn/montgomery.c
@@ -0,0 +1,571 @@
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to. The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code. The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * "This product includes cryptographic software written by
+ * Eric Young (eay@cryptsoft.com)"
+ * The word 'cryptographic' can be left out if the rouines from the library
+ * being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ * the apps directory (application code) you must include an acknowledgement:
+ * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed. i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+/* ====================================================================
+ * Copyright (c) 1998-2006 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com). */
+
+#include <openssl/bn.h>
+
+#include <string.h>
+
+#include <openssl/mem.h>
+#include <openssl/thread.h>
+
+#include "internal.h"
+
+
+#if !defined(OPENSSL_NO_ASM) && \
+ (defined(OPENSSL_X86) || defined(OPENSSL_X86_64))
+#define OPENSSL_BN_ASM_MONT
+#endif
+
+BN_MONT_CTX *BN_MONT_CTX_new(void) {
+ BN_MONT_CTX *ret = OPENSSL_malloc(sizeof(BN_MONT_CTX));
+
+ if (ret == NULL) {
+ return NULL;
+ }
+
+ BN_MONT_CTX_init(ret);
+ ret->flags = BN_FLG_MALLOCED;
+ return ret;
+}
+
+void BN_MONT_CTX_init(BN_MONT_CTX *mont) {
+ memset(mont, 0, sizeof(BN_MONT_CTX));
+ BN_init(&mont->RR);
+ BN_init(&mont->N);
+ BN_init(&mont->Ni);
+}
+
+void BN_MONT_CTX_free(BN_MONT_CTX *mont) {
+ if (mont == NULL) {
+ return;
+ }
+
+ BN_free(&mont->RR);
+ BN_free(&mont->N);
+ BN_free(&mont->Ni);
+ if (mont->flags & BN_FLG_MALLOCED) {
+ OPENSSL_free(mont);
+ }
+}
+
+BN_MONT_CTX *BN_MONT_CTX_copy(BN_MONT_CTX *to, BN_MONT_CTX *from) {
+ if (to == from) {
+ return to;
+ }
+
+ if (!BN_copy(&to->RR, &from->RR) ||
+ !BN_copy(&to->N, &from->N) ||
+ !BN_copy(&to->Ni, &from->Ni)) {
+ return NULL;
+ }
+ to->ri = from->ri;
+ to->n0[0] = from->n0[0];
+ to->n0[1] = from->n0[1];
+ return to;
+}
+
+int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx) {
+ int ret = 0;
+ BIGNUM *Ri, *R;
+ BIGNUM tmod;
+ BN_ULONG buf[2];
+
+ BN_CTX_start(ctx);
+ Ri = BN_CTX_get(ctx);
+ if (Ri == NULL) {
+ goto err;
+ }
+ R = &mont->RR; /* grab RR as a temp */
+ if (!BN_copy(&mont->N, mod)) {
+ goto err; /* Set N */
+ }
+ mont->N.neg = 0;
+
+ BN_init(&tmod);
+ tmod.d = buf;
+ tmod.dmax = 2;
+ tmod.neg = 0;
+
+ mont->ri = (BN_num_bits(mod) + (BN_BITS2 - 1)) / BN_BITS2 * BN_BITS2;
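+  /* ri is the exponent of R: R = 2^ri, where ri is the bit length of |mod|
+   * rounded up to a whole number of words. */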
+
+#if defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2 <= 32)
+ /* Only certain BN_BITS2<=32 platforms actually make use of
+ * n0[1], and we could use the #else case (with a shorter R
+ * value) for the others. However, currently only the assembler
+ * files do know which is which. */
+
+ BN_zero(R);
+ if (!BN_set_bit(R, 2 * BN_BITS2)) {
+ goto err;
+ }
+
+ tmod.top = 0;
+ if ((buf[0] = mod->d[0])) {
+ tmod.top = 1;
+ }
+ if ((buf[1] = mod->top > 1 ? mod->d[1] : 0)) {
+ tmod.top = 2;
+ }
+
+ if (BN_mod_inverse(Ri, R, &tmod, ctx) == NULL) {
+ goto err;
+ }
+ if (!BN_lshift(Ri, Ri, 2 * BN_BITS2)) {
+ goto err; /* R*Ri */
+ }
+ if (!BN_is_zero(Ri)) {
+ if (!BN_sub_word(Ri, 1)) {
+ goto err;
+ }
+ } else {
+ /* if N mod word size == 1 */
+ if (bn_expand(Ri, (int)sizeof(BN_ULONG) * 2) == NULL) {
+ goto err;
+ }
+ /* Ri-- (mod double word size) */
+ Ri->neg = 0;
+ Ri->d[0] = BN_MASK2;
+ Ri->d[1] = BN_MASK2;
+ Ri->top = 2;
+ }
+
+ if (!BN_div(Ri, NULL, Ri, &tmod, ctx)) {
+ goto err;
+ }
+ /* Ni = (R*Ri-1)/N,
+   * keep only a couple of the least significant words: */
+ mont->n0[0] = (Ri->top > 0) ? Ri->d[0] : 0;
+ mont->n0[1] = (Ri->top > 1) ? Ri->d[1] : 0;
+#else
+ BN_zero(R);
+ if (!BN_set_bit(R, BN_BITS2)) {
+ goto err; /* R */
+ }
+
+ buf[0] = mod->d[0]; /* tmod = N mod word size */
+ buf[1] = 0;
+ tmod.top = buf[0] != 0 ? 1 : 0;
+ /* Ri = R^-1 mod N*/
+ if (BN_mod_inverse(Ri, R, &tmod, ctx) == NULL) {
+ goto err;
+ }
+ if (!BN_lshift(Ri, Ri, BN_BITS2)) {
+ goto err; /* R*Ri */
+ }
+ if (!BN_is_zero(Ri)) {
+ if (!BN_sub_word(Ri, 1)) {
+ goto err;
+ }
+ } else {
+ /* if N mod word size == 1 */
+ if (!BN_set_word(Ri, BN_MASK2)) {
+ goto err; /* Ri-- (mod word size) */
+ }
+ }
+ if (!BN_div(Ri, NULL, Ri, &tmod, ctx)) {
+ goto err;
+ }
+ /* Ni = (R*Ri-1)/N,
+   * keep only the least significant word: */
+ mont->n0[0] = (Ri->top > 0) ? Ri->d[0] : 0;
+ mont->n0[1] = 0;
+#endif
+
+ /* setup RR for conversions */
+ BN_zero(&(mont->RR));
+ if (!BN_set_bit(&(mont->RR), mont->ri * 2)) {
+ goto err;
+ }
+ if (!BN_mod(&(mont->RR), &(mont->RR), &(mont->N), ctx)) {
+ goto err;
+ }
+
+ ret = 1;
+
+err:
+ BN_CTX_end(ctx);
+ return ret;
+}
+
+BN_MONT_CTX *BN_MONT_CTX_set_locked(BN_MONT_CTX **pmont, int lock,
+ const BIGNUM *mod, BN_CTX *ctx) {
+ BN_MONT_CTX *ret;
+
+ CRYPTO_r_lock(lock);
+ ret = *pmont;
+ CRYPTO_r_unlock(lock);
+ if (ret) {
+ return ret;
+ }
+
+ /* We don't want to serialise globally while doing our lazy-init math in
+ * BN_MONT_CTX_set. That punishes threads that are doing independent
+ * things. Instead, punish the case where more than one thread tries to
+ * lazy-init the same 'pmont', by having each do the lazy-init math work
+ * independently and only use the one from the thread that wins the race
+ * (the losers throw away the work they've done). */
+ ret = BN_MONT_CTX_new();
+ if (!ret) {
+ return NULL;
+ }
+ if (!BN_MONT_CTX_set(ret, mod, ctx)) {
+ BN_MONT_CTX_free(ret);
+ return NULL;
+ }
+
+ /* The locked compare-and-set, after the local work is done. */
+ CRYPTO_w_lock(lock);
+ if (*pmont) {
+ BN_MONT_CTX_free(ret);
+ ret = *pmont;
+ } else {
+ *pmont = ret;
+ }
+
+ CRYPTO_w_unlock(lock);
+
+ return ret;
+}
+
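+/* BN_to_montgomery converts |a| into Montgomery form by multiplying it by
+ * RR = R^2 mod N: Montgomery multiplication computes x*y*R^-1 mod N, so the
+ * result is a*R mod N. */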
+int BN_to_montgomery(BIGNUM *ret, const BIGNUM *a, const BN_MONT_CTX *mont,
+ BN_CTX *ctx) {
+ return BN_mod_mul_montgomery(ret, a, &mont->RR, mont, ctx);
+}
+
+static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r,
+                                   const BN_MONT_CTX *mont) {
+  const BIGNUM *n;
+  BN_ULONG *ap, *np, *rp, n0, v, carry;
+  int nl, max, i;
+
+  n = &mont->N;
+  nl = n->top;
+  if (nl == 0) {
+    ret->top = 0;
+    return 1;
+  }
+
+  max = (2 * nl); /* carry is stored separately */
+  if (bn_wexpand(r, max) == NULL) {
+    return 0;
+  }
+
+  r->neg ^= n->neg;
+  np = n->d;
+  rp = r->d;
+
+  /* clear the top words of T */
+  if (max > r->top) {
+    memset(&rp[r->top], 0, (max - r->top) * sizeof(BN_ULONG));
+  }
+
+  r->top = max;
+  n0 = mont->n0[0];
+
+  for (carry = 0, i = 0; i < nl; i++, rp++) {
+    v = bn_mul_add_words(rp, np, nl, (rp[0] * n0) & BN_MASK2);
+    v = (v + carry + rp[nl]) & BN_MASK2;
+    carry |= (v != rp[nl]);
+    carry &= (v <= rp[nl]);
+    rp[nl] = v;
+  }
+
+  if (bn_wexpand(ret, nl) == NULL) {
+    return 0;
+  }
+  ret->top = nl;
+  ret->neg = r->neg;
+
+  rp = ret->d;
+  ap = &(r->d[nl]);
+
+  {
+    BN_ULONG *nrp;
+    size_t m;
+
+    v = bn_sub_words(rp, ap, np, nl) - carry;
+    /* if the subtraction result is the real result, then trick the
+     * unconditional copy below into performing an in-place "refresh" instead
+     * of an actual copy. */
+    m = (0 - (size_t)v);
+    nrp = (BN_ULONG *)(((uintptr_t)rp & ~m) | ((uintptr_t)ap & m));
+
+    for (i = 0, nl -= 4; i < nl; i += 4) {
+      BN_ULONG t1, t2, t3, t4;
+
+      t1 = nrp[i + 0];
+      t2 = nrp[i + 1];
+      t3 = nrp[i + 2];
+      ap[i + 0] = 0;
+      t4 = nrp[i + 3];
+      ap[i + 1] = 0;
+      rp[i + 0] = t1;
+      ap[i + 2] = 0;
+      rp[i + 1] = t2;
+      ap[i + 3] = 0;
+      rp[i + 2] = t3;
+      rp[i + 3] = t4;
+    }
+
+    for (nl += 4; i < nl; i++) {
+      rp[i] = nrp[i], ap[i] = 0;
+    }
+  }
+
+  bn_correct_top(r);
+  bn_correct_top(ret);
+
+  return 1;
+}
+
+int BN_from_montgomery(BIGNUM *ret, const BIGNUM *a, const BN_MONT_CTX *mont,
+ BN_CTX *ctx) {
+ int retn = 0;
+ BIGNUM *t;
+
+ BN_CTX_start(ctx);
+ t = BN_CTX_get(ctx);
+  if (t != NULL && BN_copy(t, a)) {
+    retn = BN_from_montgomery_word(ret, t, mont);
+  }
+ BN_CTX_end(ctx);
+
+ return retn;
+}
+
+int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
+ const BN_MONT_CTX *mont, BN_CTX *ctx) {
+ BIGNUM *tmp;
+ int ret = 0;
+
+#if defined(OPENSSL_BN_ASM_MONT)
+ int num = mont->N.top;
+
+ if (num > 1 && a->top == num && b->top == num) {
+ if (bn_wexpand(r, num) == NULL) {
+ return 0;
+ }
+ if (bn_mul_mont(r->d, a->d, b->d, mont->N.d, mont->n0, num)) {
+ r->neg = a->neg ^ b->neg;
+ r->top = num;
+ bn_correct_top(r);
+ return 1;
+ }
+ }
+#endif
+
+ BN_CTX_start(ctx);
+ tmp = BN_CTX_get(ctx);
+ if (tmp == NULL) {
+ goto err;
+ }
+
+ if (a == b) {
+ if (!BN_sqr(tmp, a, ctx)) {
+ goto err;
+ }
+ } else {
+ if (!BN_mul(tmp, a, b, ctx)) {
+ goto err;
+ }
+ }
+
+ /* reduce from aRR to aR */
+ if (!BN_from_montgomery_word(r, tmp, mont)) {
+ goto err;
+ }
+
+ ret = 1;
+
+err:
+ BN_CTX_end(ctx);
+ return ret;
+}
diff --git a/src/crypto/bn/mul.c b/src/crypto/bn/mul.c
new file mode 100644
index 0000000..80c6288
--- /dev/null
+++ b/src/crypto/bn/mul.c
@@ -0,0 +1,886 @@
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to. The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code. The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * "This product includes cryptographic software written by
+ * Eric Young (eay@cryptsoft.com)"
+ * The word 'cryptographic' can be left out if the rouines from the library
+ * being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ * the apps directory (application code) you must include an acknowledgement:
+ * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed. i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.] */
+
+#include <openssl/bn.h>
+
+#include <assert.h>
+#include <string.h>
+
+#include "internal.h"
+
+
+void bn_mul_normal(BN_ULONG *r, BN_ULONG *a, int na, BN_ULONG *b, int nb) {
+ BN_ULONG *rr;
+
+ if (na < nb) {
+ int itmp;
+ BN_ULONG *ltmp;
+
+ itmp = na;
+ na = nb;
+ nb = itmp;
+ ltmp = a;
+ a = b;
+ b = ltmp;
+ }
+ rr = &(r[na]);
+ if (nb <= 0) {
+ (void)bn_mul_words(r, a, na, 0);
+ return;
+ } else {
+ rr[0] = bn_mul_words(r, a, na, b[0]);
+ }
+
+ for (;;) {
+ if (--nb <= 0) {
+ return;
+ }
+ rr[1] = bn_mul_add_words(&(r[1]), a, na, b[1]);
+ if (--nb <= 0) {
+ return;
+ }
+ rr[2] = bn_mul_add_words(&(r[2]), a, na, b[2]);
+ if (--nb <= 0) {
+ return;
+ }
+ rr[3] = bn_mul_add_words(&(r[3]), a, na, b[3]);
+ if (--nb <= 0) {
+ return;
+ }
+ rr[4] = bn_mul_add_words(&(r[4]), a, na, b[4]);
+ rr += 4;
+ r += 4;
+ b += 4;
+ }
+}
+
+void bn_mul_low_normal(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) {
+ bn_mul_words(r, a, n, b[0]);
+
+ for (;;) {
+ if (--n <= 0) {
+ return;
+ }
+ bn_mul_add_words(&(r[1]), a, n, b[1]);
+ if (--n <= 0) {
+ return;
+ }
+ bn_mul_add_words(&(r[2]), a, n, b[2]);
+ if (--n <= 0) {
+ return;
+ }
+ bn_mul_add_words(&(r[3]), a, n, b[3]);
+ if (--n <= 0) {
+ return;
+ }
+ bn_mul_add_words(&(r[4]), a, n, b[4]);
+ r += 4;
+ b += 4;
+ }
+}
+
+#if !defined(OPENSSL_X86) || defined(OPENSSL_NO_ASM)
+/* Here follow specialised variants of bn_add_words() and bn_sub_words().
+ * They operate on arrays of different sizes. The sizes of those arrays are
+ * expressed through cl, the common length (basically min(len(a), len(b))),
+ * and dl, the delta between the two lengths, calculated as
+ * len(a) - len(b). All lengths are in numbers of BN_ULONGs. For the
+ * operations that take a result array as a parameter, it must have length
+ * cl + abs(dl). These functions should probably end up in bn_asm.c as soon
+ * as there are assembler counterparts for the systems that use assembler
+ * files. */
+
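+/* For example, with len(a) = 5 and len(b) = 3, cl is 3 and dl is 2, and the
+ * result array must hold cl + abs(dl) = 5 words. */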
+static BN_ULONG bn_sub_part_words(BN_ULONG *r, const BN_ULONG *a,
+ const BN_ULONG *b, int cl, int dl) {
+ BN_ULONG c, t;
+
+ assert(cl >= 0);
+ c = bn_sub_words(r, a, b, cl);
+
+ if (dl == 0)
+ return c;
+
+ r += cl;
+ a += cl;
+ b += cl;
+
+ if (dl < 0) {
+ for (;;) {
+ t = b[0];
+ r[0] = (0 - t - c) & BN_MASK2;
+ if (t != 0) {
+ c = 1;
+ }
+ if (++dl >= 0) {
+ break;
+ }
+
+ t = b[1];
+ r[1] = (0 - t - c) & BN_MASK2;
+ if (t != 0) {
+ c = 1;
+ }
+ if (++dl >= 0) {
+ break;
+ }
+
+ t = b[2];
+ r[2] = (0 - t - c) & BN_MASK2;
+ if (t != 0) {
+ c = 1;
+ }
+ if (++dl >= 0) {
+ break;
+ }
+
+ t = b[3];
+ r[3] = (0 - t - c) & BN_MASK2;
+ if (t != 0) {
+ c = 1;
+ }
+ if (++dl >= 0) {
+ break;
+ }
+
+ b += 4;
+ r += 4;
+ }
+ } else {
+ int save_dl = dl;
+ while (c) {
+ t = a[0];
+ r[0] = (t - c) & BN_MASK2;
+ if (t != 0) {
+ c = 0;
+ }
+ if (--dl <= 0) {
+ break;
+ }
+
+ t = a[1];
+ r[1] = (t - c) & BN_MASK2;
+ if (t != 0) {
+ c = 0;
+ }
+ if (--dl <= 0) {
+ break;
+ }
+
+ t = a[2];
+ r[2] = (t - c) & BN_MASK2;
+ if (t != 0) {
+ c = 0;
+ }
+ if (--dl <= 0) {
+ break;
+ }
+
+ t = a[3];
+ r[3] = (t - c) & BN_MASK2;
+ if (t != 0) {
+ c = 0;
+ }
+ if (--dl <= 0) {
+ break;
+ }
+
+ save_dl = dl;
+ a += 4;
+ r += 4;
+ }
+ if (dl > 0) {
+ if (save_dl > dl) {
+ switch (save_dl - dl) {
+ case 1:
+ r[1] = a[1];
+ if (--dl <= 0) {
+ break;
+ }
+ case 2:
+ r[2] = a[2];
+ if (--dl <= 0) {
+ break;
+ }
+ case 3:
+ r[3] = a[3];
+ if (--dl <= 0) {
+ break;
+ }
+ }
+ a += 4;
+ r += 4;
+ }
+ }
+
+ if (dl > 0) {
+ for (;;) {
+ r[0] = a[0];
+ if (--dl <= 0) {
+ break;
+ }
+ r[1] = a[1];
+ if (--dl <= 0) {
+ break;
+ }
+ r[2] = a[2];
+ if (--dl <= 0) {
+ break;
+ }
+ r[3] = a[3];
+ if (--dl <= 0) {
+ break;
+ }
+
+ a += 4;
+ r += 4;
+ }
+ }
+ }
+
+ return c;
+}
+#else
+/* On other platforms the function is defined in asm. */
+BN_ULONG bn_sub_part_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
+ int cl, int dl);
+#endif
+
+/* Karatsuba recursive multiplication algorithm
+ * (cf. Knuth, The Art of Computer Programming, Vol. 2) */
+
+/* r is 2*n2 words in size,
+ * a and b are both n2 words in size.
+ * n2 must be a power of 2.
+ * We multiply and return the result.
+ * t must be 2*n2 words in size
+ * We calculate
+ * a[0]*b[0]
+ * a[0]*b[0]+a[1]*b[1]+(a[0]-a[1])*(b[1]-b[0])
+ * a[1]*b[1]
+ */
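+/* The middle line works because
+ * (a[0]-a[1])*(b[1]-b[0]) = a[0]*b[1] + a[1]*b[0] - a[0]*b[0] - a[1]*b[1],
+ * so adding a[0]*b[0] and a[1]*b[1] back in yields the cross terms
+ * a[0]*b[1] + a[1]*b[0] at the cost of a single extra multiplication. */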
+/* dnX may not be positive, but n2/2+dnX has to be */
+static void bn_mul_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n2,
+ int dna, int dnb, BN_ULONG *t) {
+ int n = n2 / 2, c1, c2;
+ int tna = n + dna, tnb = n + dnb;
+ unsigned int neg, zero;
+ BN_ULONG ln, lo, *p;
+
+  /* Only call bn_mul_comba8 if n2 == 8 and the
+   * two arrays are complete [steve] */
+ if (n2 == 8 && dna == 0 && dnb == 0) {
+ bn_mul_comba8(r, a, b);
+ return;
+ }
+
+ /* Else do normal multiply */
+ if (n2 < BN_MUL_RECURSIVE_SIZE_NORMAL) {
+ bn_mul_normal(r, a, n2 + dna, b, n2 + dnb);
+ if ((dna + dnb) < 0)
+ memset(&r[2 * n2 + dna + dnb], 0, sizeof(BN_ULONG) * -(dna + dnb));
+ return;
+ }
+
+  /* t = (a[0]-a[1])*(b[1]-b[0]) */
+ c1 = bn_cmp_part_words(a, &(a[n]), tna, n - tna);
+ c2 = bn_cmp_part_words(&(b[n]), b, tnb, tnb - n);
+ zero = neg = 0;
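+  /* c1 and c2 are each -1, 0 or 1, so c1 * 3 + c2 encodes the pair uniquely
+   * in [-4, 4]. The cases pick the operand orders that make both
+   * differences non-negative; |neg| records the sign of their product. */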
+ switch (c1 * 3 + c2) {
+ case -4:
+ bn_sub_part_words(t, &(a[n]), a, tna, tna - n); /* - */
+ bn_sub_part_words(&(t[n]), b, &(b[n]), tnb, n - tnb); /* - */
+ break;
+ case -3:
+ zero = 1;
+ break;
+ case -2:
+ bn_sub_part_words(t, &(a[n]), a, tna, tna - n); /* - */
+ bn_sub_part_words(&(t[n]), &(b[n]), b, tnb, tnb - n); /* + */
+ neg = 1;
+ break;
+ case -1:
+ case 0:
+ case 1:
+ zero = 1;
+ break;
+ case 2:
+ bn_sub_part_words(t, a, &(a[n]), tna, n - tna); /* + */
+ bn_sub_part_words(&(t[n]), b, &(b[n]), tnb, n - tnb); /* - */
+ neg = 1;
+ break;
+ case 3:
+ zero = 1;
+ break;
+ case 4:
+ bn_sub_part_words(t, a, &(a[n]), tna, n - tna);
+ bn_sub_part_words(&(t[n]), &(b[n]), b, tnb, tnb - n);
+ break;
+ }
+
+ if (n == 4 && dna == 0 && dnb == 0) {
+ /* XXX: bn_mul_comba4 could take extra args to do this well */
+ if (!zero) {
+ bn_mul_comba4(&(t[n2]), t, &(t[n]));
+ } else {
+ memset(&(t[n2]), 0, 8 * sizeof(BN_ULONG));
+ }
+
+ bn_mul_comba4(r, a, b);
+ bn_mul_comba4(&(r[n2]), &(a[n]), &(b[n]));
+ } else if (n == 8 && dna == 0 && dnb == 0) {
+ /* XXX: bn_mul_comba8 could take extra args to do this well */
+ if (!zero) {
+ bn_mul_comba8(&(t[n2]), t, &(t[n]));
+ } else {
+ memset(&(t[n2]), 0, 16 * sizeof(BN_ULONG));
+ }
+
+ bn_mul_comba8(r, a, b);
+ bn_mul_comba8(&(r[n2]), &(a[n]), &(b[n]));
+ } else {
+ p = &(t[n2 * 2]);
+ if (!zero) {
+ bn_mul_recursive(&(t[n2]), t, &(t[n]), n, 0, 0, p);
+ } else {
+ memset(&(t[n2]), 0, n2 * sizeof(BN_ULONG));
+ }
+ bn_mul_recursive(r, a, b, n, 0, 0, p);
+ bn_mul_recursive(&(r[n2]), &(a[n]), &(b[n]), n, dna, dnb, p);
+ }
+
+  /* t[32] holds (a[0]-a[1])*(b[1]-b[0]), neg is the sign
+   * r[10] holds (a[0]*b[0])
+   * r[32] holds (a[1]*b[1]) */
+
+ c1 = (int)(bn_add_words(t, r, &(r[n2]), n2));
+
+ if (neg) {
+ /* if t[32] is negative */
+ c1 -= (int)(bn_sub_words(&(t[n2]), t, &(t[n2]), n2));
+ } else {
+ /* Might have a carry */
+ c1 += (int)(bn_add_words(&(t[n2]), &(t[n2]), t, n2));
+ }
+
+ /* t[32] holds (a[0]-a[1])*(b[1]-b[0])+(a[0]*b[0])+(a[1]*b[1])
+ * r[10] holds (a[0]*b[0])
+   * r[32] holds (a[1]*b[1])
+ * c1 holds the carry bits */
+ c1 += (int)(bn_add_words(&(r[n]), &(r[n]), &(t[n2]), n2));
+ if (c1) {
+ p = &(r[n + n2]);
+ lo = *p;
+ ln = (lo + c1) & BN_MASK2;
+ *p = ln;
+
+ /* The overflow will stop before we over write
+ * words we should not overwrite */
+ if (ln < (BN_ULONG)c1) {
+ do {
+ p++;
+ lo = *p;
+ ln = (lo + 1) & BN_MASK2;
+ *p = ln;
+ } while (ln == 0);
+ }
+ }
+}
+
+/* n+tn is the word length
+ * t needs to be n*4 in size, as does r */
+/* tnX may not be negative, and must be less than n */
+static void bn_mul_part_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n,
+ int tna, int tnb, BN_ULONG *t) {
+ int i, j, n2 = n * 2;
+ int c1, c2, neg;
+ BN_ULONG ln, lo, *p;
+
+ if (n < 8) {
+ bn_mul_normal(r, a, n + tna, b, n + tnb);
+ return;
+ }
+
+  /* t = (a[0]-a[1])*(b[1]-b[0]) */
+ c1 = bn_cmp_part_words(a, &(a[n]), tna, n - tna);
+ c2 = bn_cmp_part_words(&(b[n]), b, tnb, tnb - n);
+ neg = 0;
+ switch (c1 * 3 + c2) {
+ case -4:
+ bn_sub_part_words(t, &(a[n]), a, tna, tna - n); /* - */
+ bn_sub_part_words(&(t[n]), b, &(b[n]), tnb, n - tnb); /* - */
+ break;
+ case -3:
+ /* break; */
+ case -2:
+ bn_sub_part_words(t, &(a[n]), a, tna, tna - n); /* - */
+ bn_sub_part_words(&(t[n]), &(b[n]), b, tnb, tnb - n); /* + */
+ neg = 1;
+ break;
+ case -1:
+ case 0:
+ case 1:
+ /* break; */
+ case 2:
+ bn_sub_part_words(t, a, &(a[n]), tna, n - tna); /* + */
+ bn_sub_part_words(&(t[n]), b, &(b[n]), tnb, n - tnb); /* - */
+ neg = 1;
+ break;
+ case 3:
+ /* break; */
+ case 4:
+ bn_sub_part_words(t, a, &(a[n]), tna, n - tna);
+ bn_sub_part_words(&(t[n]), &(b[n]), b, tnb, tnb - n);
+ break;
+ }
+
+ if (n == 8) {
+ bn_mul_comba8(&(t[n2]), t, &(t[n]));
+ bn_mul_comba8(r, a, b);
+ bn_mul_normal(&(r[n2]), &(a[n]), tna, &(b[n]), tnb);
+ memset(&(r[n2 + tna + tnb]), 0, sizeof(BN_ULONG) * (n2 - tna - tnb));
+ } else {
+ p = &(t[n2 * 2]);
+ bn_mul_recursive(&(t[n2]), t, &(t[n]), n, 0, 0, p);
+ bn_mul_recursive(r, a, b, n, 0, 0, p);
+ i = n / 2;
+ /* If there is only a bottom half to the number,
+ * just do it */
+ if (tna > tnb) {
+ j = tna - i;
+ } else {
+ j = tnb - i;
+ }
+
+ if (j == 0) {
+ bn_mul_recursive(&(r[n2]), &(a[n]), &(b[n]), i, tna - i, tnb - i, p);
+ memset(&(r[n2 + i * 2]), 0, sizeof(BN_ULONG) * (n2 - i * 2));
+ } else if (j > 0) {
+ /* eg, n == 16, i == 8 and tn == 11 */
+ bn_mul_part_recursive(&(r[n2]), &(a[n]), &(b[n]), i, tna - i, tnb - i, p);
+ memset(&(r[n2 + tna + tnb]), 0, sizeof(BN_ULONG) * (n2 - tna - tnb));
+ } else {
+ /* (j < 0) eg, n == 16, i == 8 and tn == 5 */
+ memset(&(r[n2]), 0, sizeof(BN_ULONG) * n2);
+ if (tna < BN_MUL_RECURSIVE_SIZE_NORMAL &&
+ tnb < BN_MUL_RECURSIVE_SIZE_NORMAL) {
+ bn_mul_normal(&(r[n2]), &(a[n]), tna, &(b[n]), tnb);
+ } else {
+ for (;;) {
+ i /= 2;
+            /* these simplified conditions work
+             * exclusively because the difference
+             * between tna and tnb is 1 or 0 */
+ if (i < tna || i < tnb) {
+ bn_mul_part_recursive(&(r[n2]), &(a[n]), &(b[n]), i, tna - i,
+ tnb - i, p);
+ break;
+ } else if (i == tna || i == tnb) {
+ bn_mul_recursive(&(r[n2]), &(a[n]), &(b[n]), i, tna - i, tnb - i,
+ p);
+ break;
+ }
+ }
+ }
+ }
+ }
+
+  /* t[32] holds (a[0]-a[1])*(b[1]-b[0]), neg is the sign
+   * r[10] holds (a[0]*b[0])
+   * r[32] holds (a[1]*b[1])
+   */
+
+ c1 = (int)(bn_add_words(t, r, &(r[n2]), n2));
+
+ if (neg) {
+ /* if t[32] is negative */
+ c1 -= (int)(bn_sub_words(&(t[n2]), t, &(t[n2]), n2));
+ } else {
+ /* Might have a carry */
+ c1 += (int)(bn_add_words(&(t[n2]), &(t[n2]), t, n2));
+ }
+
+ /* t[32] holds (a[0]-a[1])*(b[1]-b[0])+(a[0]*b[0])+(a[1]*b[1])
+ * r[10] holds (a[0]*b[0])
+   * r[32] holds (a[1]*b[1])
+ * c1 holds the carry bits */
+ c1 += (int)(bn_add_words(&(r[n]), &(r[n]), &(t[n2]), n2));
+ if (c1) {
+ p = &(r[n + n2]);
+ lo = *p;
+ ln = (lo + c1) & BN_MASK2;
+ *p = ln;
+
+ /* The overflow will stop before we over write
+ * words we should not overwrite */
+ if (ln < (BN_ULONG)c1) {
+ do {
+ p++;
+ lo = *p;
+ ln = (lo + 1) & BN_MASK2;
+ *p = ln;
+ } while (ln == 0);
+ }
+ }
+}
+
+int BN_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx) {
+ int ret = 0;
+ int top, al, bl;
+ BIGNUM *rr;
+ int i;
+ BIGNUM *t = NULL;
+ int j = 0, k;
+
+ al = a->top;
+ bl = b->top;
+
+ if ((al == 0) || (bl == 0)) {
+ BN_zero(r);
+ return 1;
+ }
+ top = al + bl;
+
+ BN_CTX_start(ctx);
+ if ((r == a) || (r == b)) {
+ if ((rr = BN_CTX_get(ctx)) == NULL) {
+ goto err;
+ }
+ } else {
+ rr = r;
+ }
+ rr->neg = a->neg ^ b->neg;
+
+ i = al - bl;
+ if (i == 0) {
+ if (al == 8) {
+ if (bn_wexpand(rr, 16) == NULL) {
+ goto err;
+ }
+ rr->top = 16;
+ bn_mul_comba8(rr->d, a->d, b->d);
+ goto end;
+ }
+ }
+
+ if ((al >= BN_MULL_SIZE_NORMAL) && (bl >= BN_MULL_SIZE_NORMAL)) {
+ if (i >= -1 && i <= 1) {
+      /* Find the largest power of two less than or equal to the longer of
+       * the two numbers. */
+ if (i >= 0) {
+ j = BN_num_bits_word((BN_ULONG)al);
+ }
+ if (i == -1) {
+ j = BN_num_bits_word((BN_ULONG)bl);
+ }
+ j = 1 << (j - 1);
+ assert(j <= al || j <= bl);
+ k = j + j;
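+      /* For example, al = 18 and bl = 17 give i = 1 and
+       * j = 1 << (BN_num_bits_word(18) - 1) = 16, so k = 32; since al > j,
+       * bn_mul_part_recursive handles the al - j = 2 and bl - j = 1
+       * leftover words. */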
+ t = BN_CTX_get(ctx);
+ if (t == NULL) {
+ goto err;
+ }
+ if (al > j || bl > j) {
+ if (bn_wexpand(t, k * 4) == NULL) {
+ goto err;
+ }
+ if (bn_wexpand(rr, k * 4) == NULL) {
+ goto err;
+ }
+ bn_mul_part_recursive(rr->d, a->d, b->d, j, al - j, bl - j, t->d);
+ } else {
+ /* al <= j || bl <= j */
+ if (bn_wexpand(t, k * 2) == NULL) {
+ goto err;
+ }
+ if (bn_wexpand(rr, k * 2) == NULL) {
+ goto err;
+ }
+ bn_mul_recursive(rr->d, a->d, b->d, j, al - j, bl - j, t->d);
+ }
+ rr->top = top;
+ goto end;
+ }
+ }
+
+ if (bn_wexpand(rr, top) == NULL) {
+ goto err;
+ }
+ rr->top = top;
+ bn_mul_normal(rr->d, a->d, al, b->d, bl);
+
+end:
+ bn_correct_top(rr);
+ if (r != rr) {
+ BN_copy(r, rr);
+ }
+ ret = 1;
+
+err:
+ BN_CTX_end(ctx);
+ return ret;
+}
+
+/* tmp must have 2*n words */
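+/* bn_sqr_normal computes each off-diagonal product a[i]*a[j] (i < j) once,
+ * doubles the accumulated sum with bn_add_words(r, r, r, max), and then adds
+ * the diagonal squares produced by bn_sqr_words. */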
+static void bn_sqr_normal(BN_ULONG *r, const BN_ULONG *a, int n, BN_ULONG *tmp) {
+ int i, j, max;
+ const BN_ULONG *ap;
+ BN_ULONG *rp;
+
+ max = n * 2;
+ ap = a;
+ rp = r;
+ rp[0] = rp[max - 1] = 0;
+ rp++;
+ j = n;
+
+ if (--j > 0) {
+ ap++;
+ rp[j] = bn_mul_words(rp, ap, j, ap[-1]);
+ rp += 2;
+ }
+
+ for (i = n - 2; i > 0; i--) {
+ j--;
+ ap++;
+ rp[j] = bn_mul_add_words(rp, ap, j, ap[-1]);
+ rp += 2;
+ }
+
+ bn_add_words(r, r, r, max);
+
+ /* There will not be a carry */
+
+ bn_sqr_words(tmp, a, n);
+
+ bn_add_words(r, r, tmp, max);
+}
+
+/* r is 2*n words in size,
+ * a and b are both n words in size. (There's not actually a 'b' here ...)
+ * n must be a power of 2.
+ * We multiply and return the result.
+ * t must be 2*n words in size
+ * We calculate
+ * a[0]*b[0]
+ * a[0]*b[0]+a[1]*b[1]+(a[0]-a[1])*(b[1]-b[0])
+ * a[1]*b[1]
+ */
+static void bn_sqr_recursive(BN_ULONG *r, const BN_ULONG *a, int n2, BN_ULONG *t) {
+ int n = n2 / 2;
+ int zero, c1;
+ BN_ULONG ln, lo, *p;
+
+ if (n2 == 4) {
+ bn_sqr_comba4(r, a);
+ return;
+ } else if (n2 == 8) {
+ bn_sqr_comba8(r, a);
+ return;
+ }
+ if (n2 < BN_SQR_RECURSIVE_SIZE_NORMAL) {
+ bn_sqr_normal(r, a, n2, t);
+ return;
+ }
+  /* t = (a[0]-a[1])*(a[1]-a[0]) */
+ c1 = bn_cmp_words(a, &(a[n]), n);
+ zero = 0;
+ if (c1 > 0) {
+ bn_sub_words(t, a, &(a[n]), n);
+ } else if (c1 < 0) {
+ bn_sub_words(t, &(a[n]), a, n);
+ } else {
+ zero = 1;
+ }
+
+ /* The result will always be negative unless it is zero */
+ p = &(t[n2 * 2]);
+
+ if (!zero) {
+ bn_sqr_recursive(&(t[n2]), t, n, p);
+ } else {
+ memset(&(t[n2]), 0, n2 * sizeof(BN_ULONG));
+ }
+ bn_sqr_recursive(r, a, n, p);
+ bn_sqr_recursive(&(r[n2]), &(a[n]), n, p);
+
+ /* t[32] holds (a[0]-a[1])*(a[1]-a[0]), it is negative or zero
+   * r[10] holds (a[0]*a[0])
+   * r[32] holds (a[1]*a[1]) */
+
+ c1 = (int)(bn_add_words(t, r, &(r[n2]), n2));
+
+ /* t[32] is negative */
+ c1 -= (int)(bn_sub_words(&(t[n2]), t, &(t[n2]), n2));
+
+ /* t[32] holds (a[0]-a[1])*(a[1]-a[0])+(a[0]*a[0])+(a[1]*a[1])
+ * r[10] holds (a[0]*a[0])
+ * r[32] holds (a[1]*a[1])
+ * c1 holds the carry bits */
+ c1 += (int)(bn_add_words(&(r[n]), &(r[n]), &(t[n2]), n2));
+ if (c1) {
+ p = &(r[n + n2]);
+ lo = *p;
+ ln = (lo + c1) & BN_MASK2;
+ *p = ln;
+
+ /* The overflow will stop before we over write
+ * words we should not overwrite */
+ if (ln < (BN_ULONG)c1) {
+ do {
+ p++;
+ lo = *p;
+ ln = (lo + 1) & BN_MASK2;
+ *p = ln;
+ } while (ln == 0);
+ }
+ }
+}
+
+int BN_mul_word(BIGNUM *bn, BN_ULONG w) {
+ BN_ULONG ll;
+
+ w &= BN_MASK2;
+ if (!bn->top) {
+ return 1;
+ }
+
+ if (w == 0) {
+ BN_zero(bn);
+ return 1;
+ }
+
+ ll = bn_mul_words(bn->d, bn->d, bn->top, w);
+ if (ll) {
+ if (bn_wexpand(bn, bn->top + 1) == NULL) {
+ return 0;
+ }
+ bn->d[bn->top++] = ll;
+ }
+
+ return 1;
+}
+
+int BN_sqr(BIGNUM *r, const BIGNUM *a, BN_CTX *ctx) {
+ int max, al;
+ int ret = 0;
+ BIGNUM *tmp, *rr;
+
+ al = a->top;
+ if (al <= 0) {
+ r->top = 0;
+ r->neg = 0;
+ return 1;
+ }
+
+ BN_CTX_start(ctx);
+ rr = (a != r) ? r : BN_CTX_get(ctx);
+ tmp = BN_CTX_get(ctx);
+ if (!rr || !tmp) {
+ goto err;
+ }
+
+ max = 2 * al; /* Non-zero (from above) */
+ if (bn_wexpand(rr, max) == NULL) {
+ goto err;
+ }
+
+ if (al == 4) {
+ bn_sqr_comba4(rr->d, a->d);
+ } else if (al == 8) {
+ bn_sqr_comba8(rr->d, a->d);
+ } else {
+ if (al < BN_SQR_RECURSIVE_SIZE_NORMAL) {
+ BN_ULONG t[BN_SQR_RECURSIVE_SIZE_NORMAL * 2];
+ bn_sqr_normal(rr->d, a->d, al, t);
+ } else {
+ int j, k;
+
+ j = BN_num_bits_word((BN_ULONG)al);
+ j = 1 << (j - 1);
+ k = j + j;
+ if (al == j) {
+ if (bn_wexpand(tmp, k * 2) == NULL) {
+ goto err;
+ }
+ bn_sqr_recursive(rr->d, a->d, al, tmp->d);
+ } else {
+ if (bn_wexpand(tmp, max) == NULL) {
+ goto err;
+ }
+ bn_sqr_normal(rr->d, a->d, al, tmp->d);
+ }
+ }
+ }
+
+ rr->neg = 0;
+ /* If the most-significant half of the top word of 'a' is zero, then
+   * the square of 'a' will fit in max-1 words. */
+ if (a->d[al - 1] == (a->d[al - 1] & BN_MASK2l)) {
+ rr->top = max - 1;
+ } else {
+ rr->top = max;
+ }
+
+ if (rr != r) {
+ BN_copy(r, rr);
+ }
+ ret = 1;
+
+err:
+ BN_CTX_end(ctx);
+ return ret;
+}
diff --git a/src/crypto/bn/prime.c b/src/crypto/bn/prime.c
new file mode 100644
index 0000000..fc9a3d5
--- /dev/null
+++ b/src/crypto/bn/prime.c
@@ -0,0 +1,838 @@
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to. The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code. The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * "This product includes cryptographic software written by
+ * Eric Young (eay@cryptsoft.com)"
+ * The word 'cryptographic' can be left out if the rouines from the library
+ * being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ * the apps directory (application code) you must include an acknowledgement:
+ * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed. i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+/* ====================================================================
+ * Copyright (c) 1998-2001 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com). */
+
+#include <openssl/bn.h>
+
+#include <openssl/err.h>
+#include <openssl/mem.h>
+
+#include "internal.h"
+
+/* number of Miller-Rabin iterations for an error rate of less than 2^-80
+ * for random 'b'-bit input, b >= 100 (taken from table 4.4 in the Handbook
+ * of Applied Cryptography [Menezes, van Oorschot, Vanstone; CRC Press 1996];
+ * original paper: Damgaard, Landrock, Pomerance: Average case error estimates
+ * for the strong probable prime test. -- Math. Comp. 61 (1993) 177-194) */
+#define BN_prime_checks_for_size(b) ((b) >= 1300 ? 2 : \
+ (b) >= 850 ? 3 : \
+ (b) >= 650 ? 4 : \
+ (b) >= 550 ? 5 : \
+ (b) >= 450 ? 6 : \
+ (b) >= 400 ? 7 : \
+ (b) >= 350 ? 8 : \
+ (b) >= 300 ? 9 : \
+ (b) >= 250 ? 12 : \
+ (b) >= 200 ? 15 : \
+ (b) >= 150 ? 18 : \
+ /* b >= 100 */ 27)
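+
+/* For example, a random 1024-bit candidate falls into the "(b) >= 850" bucket
+ * above, so BN_prime_checks_for_size(1024) selects three Miller-Rabin
+ * iterations. */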
+
+/* The quick sieve approach to weeding out composite candidates is Philip
+ * Zimmermann's, as implemented in PGP. I have had a read of his comments and
+ * implemented my own version. */
+
+/* NUMPRIMES is the number of entries in the |primes| table. */
+#define NUMPRIMES 2048
+
+/* primes contains the first NUMPRIMES prime numbers, each of which fits in a
+ * uint16_t. */
+static const uint16_t primes[NUMPRIMES] = {
+ 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31,
+ 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79,
+ 83, 89, 97, 101, 103, 107, 109, 113, 127, 131, 137,
+ 139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193,
+ 197, 199, 211, 223, 227, 229, 233, 239, 241, 251, 257,
+ 263, 269, 271, 277, 281, 283, 293, 307, 311, 313, 317,
+ 331, 337, 347, 349, 353, 359, 367, 373, 379, 383, 389,
+ 397, 401, 409, 419, 421, 431, 433, 439, 443, 449, 457,
+ 461, 463, 467, 479, 487, 491, 499, 503, 509, 521, 523,
+ 541, 547, 557, 563, 569, 571, 577, 587, 593, 599, 601,
+ 607, 613, 617, 619, 631, 641, 643, 647, 653, 659, 661,
+ 673, 677, 683, 691, 701, 709, 719, 727, 733, 739, 743,
+ 751, 757, 761, 769, 773, 787, 797, 809, 811, 821, 823,
+ 827, 829, 839, 853, 857, 859, 863, 877, 881, 883, 887,
+ 907, 911, 919, 929, 937, 941, 947, 953, 967, 971, 977,
+ 983, 991, 997, 1009, 1013, 1019, 1021, 1031, 1033, 1039, 1049,
+ 1051, 1061, 1063, 1069, 1087, 1091, 1093, 1097, 1103, 1109, 1117,
+ 1123, 1129, 1151, 1153, 1163, 1171, 1181, 1187, 1193, 1201, 1213,
+ 1217, 1223, 1229, 1231, 1237, 1249, 1259, 1277, 1279, 1283, 1289,
+ 1291, 1297, 1301, 1303, 1307, 1319, 1321, 1327, 1361, 1367, 1373,
+ 1381, 1399, 1409, 1423, 1427, 1429, 1433, 1439, 1447, 1451, 1453,
+ 1459, 1471, 1481, 1483, 1487, 1489, 1493, 1499, 1511, 1523, 1531,
+ 1543, 1549, 1553, 1559, 1567, 1571, 1579, 1583, 1597, 1601, 1607,
+ 1609, 1613, 1619, 1621, 1627, 1637, 1657, 1663, 1667, 1669, 1693,
+ 1697, 1699, 1709, 1721, 1723, 1733, 1741, 1747, 1753, 1759, 1777,
+ 1783, 1787, 1789, 1801, 1811, 1823, 1831, 1847, 1861, 1867, 1871,
+ 1873, 1877, 1879, 1889, 1901, 1907, 1913, 1931, 1933, 1949, 1951,
+ 1973, 1979, 1987, 1993, 1997, 1999, 2003, 2011, 2017, 2027, 2029,
+ 2039, 2053, 2063, 2069, 2081, 2083, 2087, 2089, 2099, 2111, 2113,
+ 2129, 2131, 2137, 2141, 2143, 2153, 2161, 2179, 2203, 2207, 2213,
+ 2221, 2237, 2239, 2243, 2251, 2267, 2269, 2273, 2281, 2287, 2293,
+ 2297, 2309, 2311, 2333, 2339, 2341, 2347, 2351, 2357, 2371, 2377,
+ 2381, 2383, 2389, 2393, 2399, 2411, 2417, 2423, 2437, 2441, 2447,
+ 2459, 2467, 2473, 2477, 2503, 2521, 2531, 2539, 2543, 2549, 2551,
+ 2557, 2579, 2591, 2593, 2609, 2617, 2621, 2633, 2647, 2657, 2659,
+ 2663, 2671, 2677, 2683, 2687, 2689, 2693, 2699, 2707, 2711, 2713,
+ 2719, 2729, 2731, 2741, 2749, 2753, 2767, 2777, 2789, 2791, 2797,
+ 2801, 2803, 2819, 2833, 2837, 2843, 2851, 2857, 2861, 2879, 2887,
+ 2897, 2903, 2909, 2917, 2927, 2939, 2953, 2957, 2963, 2969, 2971,
+ 2999, 3001, 3011, 3019, 3023, 3037, 3041, 3049, 3061, 3067, 3079,
+ 3083, 3089, 3109, 3119, 3121, 3137, 3163, 3167, 3169, 3181, 3187,
+ 3191, 3203, 3209, 3217, 3221, 3229, 3251, 3253, 3257, 3259, 3271,
+ 3299, 3301, 3307, 3313, 3319, 3323, 3329, 3331, 3343, 3347, 3359,
+ 3361, 3371, 3373, 3389, 3391, 3407, 3413, 3433, 3449, 3457, 3461,
+ 3463, 3467, 3469, 3491, 3499, 3511, 3517, 3527, 3529, 3533, 3539,
+ 3541, 3547, 3557, 3559, 3571, 3581, 3583, 3593, 3607, 3613, 3617,
+ 3623, 3631, 3637, 3643, 3659, 3671, 3673, 3677, 3691, 3697, 3701,
+ 3709, 3719, 3727, 3733, 3739, 3761, 3767, 3769, 3779, 3793, 3797,
+ 3803, 3821, 3823, 3833, 3847, 3851, 3853, 3863, 3877, 3881, 3889,
+ 3907, 3911, 3917, 3919, 3923, 3929, 3931, 3943, 3947, 3967, 3989,
+ 4001, 4003, 4007, 4013, 4019, 4021, 4027, 4049, 4051, 4057, 4073,
+ 4079, 4091, 4093, 4099, 4111, 4127, 4129, 4133, 4139, 4153, 4157,
+ 4159, 4177, 4201, 4211, 4217, 4219, 4229, 4231, 4241, 4243, 4253,
+ 4259, 4261, 4271, 4273, 4283, 4289, 4297, 4327, 4337, 4339, 4349,
+ 4357, 4363, 4373, 4391, 4397, 4409, 4421, 4423, 4441, 4447, 4451,
+ 4457, 4463, 4481, 4483, 4493, 4507, 4513, 4517, 4519, 4523, 4547,
+ 4549, 4561, 4567, 4583, 4591, 4597, 4603, 4621, 4637, 4639, 4643,
+ 4649, 4651, 4657, 4663, 4673, 4679, 4691, 4703, 4721, 4723, 4729,
+ 4733, 4751, 4759, 4783, 4787, 4789, 4793, 4799, 4801, 4813, 4817,
+ 4831, 4861, 4871, 4877, 4889, 4903, 4909, 4919, 4931, 4933, 4937,
+ 4943, 4951, 4957, 4967, 4969, 4973, 4987, 4993, 4999, 5003, 5009,
+ 5011, 5021, 5023, 5039, 5051, 5059, 5077, 5081, 5087, 5099, 5101,
+ 5107, 5113, 5119, 5147, 5153, 5167, 5171, 5179, 5189, 5197, 5209,
+ 5227, 5231, 5233, 5237, 5261, 5273, 5279, 5281, 5297, 5303, 5309,
+ 5323, 5333, 5347, 5351, 5381, 5387, 5393, 5399, 5407, 5413, 5417,
+ 5419, 5431, 5437, 5441, 5443, 5449, 5471, 5477, 5479, 5483, 5501,
+ 5503, 5507, 5519, 5521, 5527, 5531, 5557, 5563, 5569, 5573, 5581,
+ 5591, 5623, 5639, 5641, 5647, 5651, 5653, 5657, 5659, 5669, 5683,
+ 5689, 5693, 5701, 5711, 5717, 5737, 5741, 5743, 5749, 5779, 5783,
+ 5791, 5801, 5807, 5813, 5821, 5827, 5839, 5843, 5849, 5851, 5857,
+ 5861, 5867, 5869, 5879, 5881, 5897, 5903, 5923, 5927, 5939, 5953,
+ 5981, 5987, 6007, 6011, 6029, 6037, 6043, 6047, 6053, 6067, 6073,
+ 6079, 6089, 6091, 6101, 6113, 6121, 6131, 6133, 6143, 6151, 6163,
+ 6173, 6197, 6199, 6203, 6211, 6217, 6221, 6229, 6247, 6257, 6263,
+ 6269, 6271, 6277, 6287, 6299, 6301, 6311, 6317, 6323, 6329, 6337,
+ 6343, 6353, 6359, 6361, 6367, 6373, 6379, 6389, 6397, 6421, 6427,
+ 6449, 6451, 6469, 6473, 6481, 6491, 6521, 6529, 6547, 6551, 6553,
+ 6563, 6569, 6571, 6577, 6581, 6599, 6607, 6619, 6637, 6653, 6659,
+ 6661, 6673, 6679, 6689, 6691, 6701, 6703, 6709, 6719, 6733, 6737,
+ 6761, 6763, 6779, 6781, 6791, 6793, 6803, 6823, 6827, 6829, 6833,
+ 6841, 6857, 6863, 6869, 6871, 6883, 6899, 6907, 6911, 6917, 6947,
+ 6949, 6959, 6961, 6967, 6971, 6977, 6983, 6991, 6997, 7001, 7013,
+ 7019, 7027, 7039, 7043, 7057, 7069, 7079, 7103, 7109, 7121, 7127,
+ 7129, 7151, 7159, 7177, 7187, 7193, 7207, 7211, 7213, 7219, 7229,
+ 7237, 7243, 7247, 7253, 7283, 7297, 7307, 7309, 7321, 7331, 7333,
+ 7349, 7351, 7369, 7393, 7411, 7417, 7433, 7451, 7457, 7459, 7477,
+ 7481, 7487, 7489, 7499, 7507, 7517, 7523, 7529, 7537, 7541, 7547,
+ 7549, 7559, 7561, 7573, 7577, 7583, 7589, 7591, 7603, 7607, 7621,
+ 7639, 7643, 7649, 7669, 7673, 7681, 7687, 7691, 7699, 7703, 7717,
+ 7723, 7727, 7741, 7753, 7757, 7759, 7789, 7793, 7817, 7823, 7829,
+ 7841, 7853, 7867, 7873, 7877, 7879, 7883, 7901, 7907, 7919, 7927,
+ 7933, 7937, 7949, 7951, 7963, 7993, 8009, 8011, 8017, 8039, 8053,
+ 8059, 8069, 8081, 8087, 8089, 8093, 8101, 8111, 8117, 8123, 8147,
+ 8161, 8167, 8171, 8179, 8191, 8209, 8219, 8221, 8231, 8233, 8237,
+ 8243, 8263, 8269, 8273, 8287, 8291, 8293, 8297, 8311, 8317, 8329,
+ 8353, 8363, 8369, 8377, 8387, 8389, 8419, 8423, 8429, 8431, 8443,
+ 8447, 8461, 8467, 8501, 8513, 8521, 8527, 8537, 8539, 8543, 8563,
+ 8573, 8581, 8597, 8599, 8609, 8623, 8627, 8629, 8641, 8647, 8663,
+ 8669, 8677, 8681, 8689, 8693, 8699, 8707, 8713, 8719, 8731, 8737,
+ 8741, 8747, 8753, 8761, 8779, 8783, 8803, 8807, 8819, 8821, 8831,
+ 8837, 8839, 8849, 8861, 8863, 8867, 8887, 8893, 8923, 8929, 8933,
+ 8941, 8951, 8963, 8969, 8971, 8999, 9001, 9007, 9011, 9013, 9029,
+ 9041, 9043, 9049, 9059, 9067, 9091, 9103, 9109, 9127, 9133, 9137,
+ 9151, 9157, 9161, 9173, 9181, 9187, 9199, 9203, 9209, 9221, 9227,
+ 9239, 9241, 9257, 9277, 9281, 9283, 9293, 9311, 9319, 9323, 9337,
+ 9341, 9343, 9349, 9371, 9377, 9391, 9397, 9403, 9413, 9419, 9421,
+ 9431, 9433, 9437, 9439, 9461, 9463, 9467, 9473, 9479, 9491, 9497,
+ 9511, 9521, 9533, 9539, 9547, 9551, 9587, 9601, 9613, 9619, 9623,
+ 9629, 9631, 9643, 9649, 9661, 9677, 9679, 9689, 9697, 9719, 9721,
+ 9733, 9739, 9743, 9749, 9767, 9769, 9781, 9787, 9791, 9803, 9811,
+ 9817, 9829, 9833, 9839, 9851, 9857, 9859, 9871, 9883, 9887, 9901,
+ 9907, 9923, 9929, 9931, 9941, 9949, 9967, 9973, 10007, 10009, 10037,
+ 10039, 10061, 10067, 10069, 10079, 10091, 10093, 10099, 10103, 10111, 10133,
+ 10139, 10141, 10151, 10159, 10163, 10169, 10177, 10181, 10193, 10211, 10223,
+ 10243, 10247, 10253, 10259, 10267, 10271, 10273, 10289, 10301, 10303, 10313,
+ 10321, 10331, 10333, 10337, 10343, 10357, 10369, 10391, 10399, 10427, 10429,
+ 10433, 10453, 10457, 10459, 10463, 10477, 10487, 10499, 10501, 10513, 10529,
+ 10531, 10559, 10567, 10589, 10597, 10601, 10607, 10613, 10627, 10631, 10639,
+ 10651, 10657, 10663, 10667, 10687, 10691, 10709, 10711, 10723, 10729, 10733,
+ 10739, 10753, 10771, 10781, 10789, 10799, 10831, 10837, 10847, 10853, 10859,
+ 10861, 10867, 10883, 10889, 10891, 10903, 10909, 10937, 10939, 10949, 10957,
+ 10973, 10979, 10987, 10993, 11003, 11027, 11047, 11057, 11059, 11069, 11071,
+ 11083, 11087, 11093, 11113, 11117, 11119, 11131, 11149, 11159, 11161, 11171,
+ 11173, 11177, 11197, 11213, 11239, 11243, 11251, 11257, 11261, 11273, 11279,
+ 11287, 11299, 11311, 11317, 11321, 11329, 11351, 11353, 11369, 11383, 11393,
+ 11399, 11411, 11423, 11437, 11443, 11447, 11467, 11471, 11483, 11489, 11491,
+ 11497, 11503, 11519, 11527, 11549, 11551, 11579, 11587, 11593, 11597, 11617,
+ 11621, 11633, 11657, 11677, 11681, 11689, 11699, 11701, 11717, 11719, 11731,
+ 11743, 11777, 11779, 11783, 11789, 11801, 11807, 11813, 11821, 11827, 11831,
+ 11833, 11839, 11863, 11867, 11887, 11897, 11903, 11909, 11923, 11927, 11933,
+ 11939, 11941, 11953, 11959, 11969, 11971, 11981, 11987, 12007, 12011, 12037,
+ 12041, 12043, 12049, 12071, 12073, 12097, 12101, 12107, 12109, 12113, 12119,
+ 12143, 12149, 12157, 12161, 12163, 12197, 12203, 12211, 12227, 12239, 12241,
+ 12251, 12253, 12263, 12269, 12277, 12281, 12289, 12301, 12323, 12329, 12343,
+ 12347, 12373, 12377, 12379, 12391, 12401, 12409, 12413, 12421, 12433, 12437,
+ 12451, 12457, 12473, 12479, 12487, 12491, 12497, 12503, 12511, 12517, 12527,
+ 12539, 12541, 12547, 12553, 12569, 12577, 12583, 12589, 12601, 12611, 12613,
+ 12619, 12637, 12641, 12647, 12653, 12659, 12671, 12689, 12697, 12703, 12713,
+ 12721, 12739, 12743, 12757, 12763, 12781, 12791, 12799, 12809, 12821, 12823,
+ 12829, 12841, 12853, 12889, 12893, 12899, 12907, 12911, 12917, 12919, 12923,
+ 12941, 12953, 12959, 12967, 12973, 12979, 12983, 13001, 13003, 13007, 13009,
+ 13033, 13037, 13043, 13049, 13063, 13093, 13099, 13103, 13109, 13121, 13127,
+ 13147, 13151, 13159, 13163, 13171, 13177, 13183, 13187, 13217, 13219, 13229,
+ 13241, 13249, 13259, 13267, 13291, 13297, 13309, 13313, 13327, 13331, 13337,
+ 13339, 13367, 13381, 13397, 13399, 13411, 13417, 13421, 13441, 13451, 13457,
+ 13463, 13469, 13477, 13487, 13499, 13513, 13523, 13537, 13553, 13567, 13577,
+ 13591, 13597, 13613, 13619, 13627, 13633, 13649, 13669, 13679, 13681, 13687,
+ 13691, 13693, 13697, 13709, 13711, 13721, 13723, 13729, 13751, 13757, 13759,
+ 13763, 13781, 13789, 13799, 13807, 13829, 13831, 13841, 13859, 13873, 13877,
+ 13879, 13883, 13901, 13903, 13907, 13913, 13921, 13931, 13933, 13963, 13967,
+ 13997, 13999, 14009, 14011, 14029, 14033, 14051, 14057, 14071, 14081, 14083,
+ 14087, 14107, 14143, 14149, 14153, 14159, 14173, 14177, 14197, 14207, 14221,
+ 14243, 14249, 14251, 14281, 14293, 14303, 14321, 14323, 14327, 14341, 14347,
+ 14369, 14387, 14389, 14401, 14407, 14411, 14419, 14423, 14431, 14437, 14447,
+ 14449, 14461, 14479, 14489, 14503, 14519, 14533, 14537, 14543, 14549, 14551,
+ 14557, 14561, 14563, 14591, 14593, 14621, 14627, 14629, 14633, 14639, 14653,
+ 14657, 14669, 14683, 14699, 14713, 14717, 14723, 14731, 14737, 14741, 14747,
+ 14753, 14759, 14767, 14771, 14779, 14783, 14797, 14813, 14821, 14827, 14831,
+ 14843, 14851, 14867, 14869, 14879, 14887, 14891, 14897, 14923, 14929, 14939,
+ 14947, 14951, 14957, 14969, 14983, 15013, 15017, 15031, 15053, 15061, 15073,
+ 15077, 15083, 15091, 15101, 15107, 15121, 15131, 15137, 15139, 15149, 15161,
+ 15173, 15187, 15193, 15199, 15217, 15227, 15233, 15241, 15259, 15263, 15269,
+ 15271, 15277, 15287, 15289, 15299, 15307, 15313, 15319, 15329, 15331, 15349,
+ 15359, 15361, 15373, 15377, 15383, 15391, 15401, 15413, 15427, 15439, 15443,
+ 15451, 15461, 15467, 15473, 15493, 15497, 15511, 15527, 15541, 15551, 15559,
+ 15569, 15581, 15583, 15601, 15607, 15619, 15629, 15641, 15643, 15647, 15649,
+ 15661, 15667, 15671, 15679, 15683, 15727, 15731, 15733, 15737, 15739, 15749,
+ 15761, 15767, 15773, 15787, 15791, 15797, 15803, 15809, 15817, 15823, 15859,
+ 15877, 15881, 15887, 15889, 15901, 15907, 15913, 15919, 15923, 15937, 15959,
+ 15971, 15973, 15991, 16001, 16007, 16033, 16057, 16061, 16063, 16067, 16069,
+ 16073, 16087, 16091, 16097, 16103, 16111, 16127, 16139, 16141, 16183, 16187,
+ 16189, 16193, 16217, 16223, 16229, 16231, 16249, 16253, 16267, 16273, 16301,
+ 16319, 16333, 16339, 16349, 16361, 16363, 16369, 16381, 16411, 16417, 16421,
+ 16427, 16433, 16447, 16451, 16453, 16477, 16481, 16487, 16493, 16519, 16529,
+ 16547, 16553, 16561, 16567, 16573, 16603, 16607, 16619, 16631, 16633, 16649,
+ 16651, 16657, 16661, 16673, 16691, 16693, 16699, 16703, 16729, 16741, 16747,
+ 16759, 16763, 16787, 16811, 16823, 16829, 16831, 16843, 16871, 16879, 16883,
+ 16889, 16901, 16903, 16921, 16927, 16931, 16937, 16943, 16963, 16979, 16981,
+ 16987, 16993, 17011, 17021, 17027, 17029, 17033, 17041, 17047, 17053, 17077,
+ 17093, 17099, 17107, 17117, 17123, 17137, 17159, 17167, 17183, 17189, 17191,
+ 17203, 17207, 17209, 17231, 17239, 17257, 17291, 17293, 17299, 17317, 17321,
+ 17327, 17333, 17341, 17351, 17359, 17377, 17383, 17387, 17389, 17393, 17401,
+ 17417, 17419, 17431, 17443, 17449, 17467, 17471, 17477, 17483, 17489, 17491,
+ 17497, 17509, 17519, 17539, 17551, 17569, 17573, 17579, 17581, 17597, 17599,
+ 17609, 17623, 17627, 17657, 17659, 17669, 17681, 17683, 17707, 17713, 17729,
+ 17737, 17747, 17749, 17761, 17783, 17789, 17791, 17807, 17827, 17837, 17839,
+ 17851, 17863,
+};
+
+static int witness(BIGNUM *w, const BIGNUM *a, const BIGNUM *a1,
+ const BIGNUM *a1_odd, int k, BN_CTX *ctx, BN_MONT_CTX *mont);
+static int probable_prime(BIGNUM *rnd, int bits);
+static int probable_prime_dh(BIGNUM *rnd, int bits, const BIGNUM *add,
+ const BIGNUM *rem, BN_CTX *ctx);
+static int probable_prime_dh_safe(BIGNUM *rnd, int bits, const BIGNUM *add,
+ const BIGNUM *rem, BN_CTX *ctx);
+
+void BN_GENCB_set(BN_GENCB *callback,
+ int (*f)(int event, int n, struct bn_gencb_st *),
+ void *arg) {
+ callback->callback = f;
+ callback->arg = arg;
+}
+
+int BN_GENCB_call(BN_GENCB *callback, int event, int n) {
+ if (!callback) {
+ return 1;
+ }
+
+ return callback->callback(event, n, callback);
+}
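+
+/* A minimal, hypothetical example of wiring up a progress callback; the name
+ * print_progress is illustrative only:
+ *
+ *   static int print_progress(int event, int n, struct bn_gencb_st *gencb) {
+ *     printf("event=%d n=%d\n", event, n);
+ *     return 1;  // returning 0 aborts the operation that invoked the callback
+ *   }
+ *
+ *   BN_GENCB cb;
+ *   BN_GENCB_set(&cb, print_progress, NULL);
+ *   BN_generate_prime_ex(p, 2048, 0, NULL, NULL, &cb);
+ */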
+
+int BN_generate_prime_ex(BIGNUM *ret, int bits, int safe, const BIGNUM *add,
+ const BIGNUM *rem, BN_GENCB *cb) {
+ BIGNUM *t;
+ int found = 0;
+ int i, j, c1 = 0;
+ BN_CTX *ctx;
+ int checks = BN_prime_checks_for_size(bits);
+
+ if (bits < 2) {
+ /* There are no prime numbers this small. */
+ OPENSSL_PUT_ERROR(BN, BN_generate_prime_ex, BN_R_BITS_TOO_SMALL);
+ return 0;
+ } else if (bits == 2 && safe) {
+ /* The smallest safe prime (7) is three bits. */
+ OPENSSL_PUT_ERROR(BN, BN_generate_prime_ex, BN_R_BITS_TOO_SMALL);
+ return 0;
+ }
+
+ ctx = BN_CTX_new();
+ if (ctx == NULL) {
+ goto err;
+ }
+ BN_CTX_start(ctx);
+ t = BN_CTX_get(ctx);
+ if (!t) {
+ goto err;
+ }
+
+loop:
+ /* make a random number and set the top and bottom bits */
+ if (add == NULL) {
+ if (!probable_prime(ret, bits)) {
+ goto err;
+ }
+ } else {
+ if (safe) {
+ if (!probable_prime_dh_safe(ret, bits, add, rem, ctx)) {
+ goto err;
+ }
+ } else {
+ if (!probable_prime_dh(ret, bits, add, rem, ctx)) {
+ goto err;
+ }
+ }
+ }
+
+ if (!BN_GENCB_call(cb, BN_GENCB_GENERATED, c1++)) {
+ /* aborted */
+ goto err;
+ }
+
+ if (!safe) {
+ i = BN_is_prime_fasttest_ex(ret, checks, ctx, 0, cb);
+ if (i == -1) {
+ goto err;
+ } else if (i == 0) {
+ goto loop;
+ }
+ } else {
+    /* For "safe prime" generation, check that (p-1)/2 is also prime. Since p
+     * is odd, we just need to divide by 2. */
+ if (!BN_rshift1(t, ret)) {
+ goto err;
+ }
+
+ for (i = 0; i < checks; i++) {
+ j = BN_is_prime_fasttest_ex(ret, 1, ctx, 0, NULL);
+ if (j == -1) {
+ goto err;
+ } else if (j == 0) {
+ goto loop;
+ }
+
+ j = BN_is_prime_fasttest_ex(t, 1, ctx, 0, NULL);
+ if (j == -1) {
+ goto err;
+ } else if (j == 0) {
+ goto loop;
+ }
+
+ if (!BN_GENCB_call(cb, i, c1 - 1)) {
+ goto err;
+ }
+      /* Both ret and t = (ret-1)/2 passed this round of testing. */
+ }
+ }
+
+ /* we have a prime :-) */
+ found = 1;
+
+err:
+ if (ctx != NULL) {
+ BN_CTX_end(ctx);
+ BN_CTX_free(ctx);
+ }
+
+ return found;
+}
+
+int BN_primality_test(int *is_probably_prime, const BIGNUM *candidate,
+ int checks, BN_CTX *ctx, int do_trial_division,
+ BN_GENCB *cb) {
+ switch (BN_is_prime_fasttest_ex(candidate, checks, ctx, do_trial_division, cb)) {
+ case 1:
+ *is_probably_prime = 1;
+ return 1;
+ case 0:
+ *is_probably_prime = 0;
+ return 1;
+ default:
+ *is_probably_prime = 0;
+ return 0;
+ }
+}
+
+int BN_is_prime_ex(const BIGNUM *candidate, int checks, BN_CTX *ctx, BN_GENCB *cb) {
+ return BN_is_prime_fasttest_ex(candidate, checks, ctx, 0, cb);
+}
+
+int BN_is_prime_fasttest_ex(const BIGNUM *a, int checks, BN_CTX *ctx_passed,
+ int do_trial_division, BN_GENCB *cb) {
+ int i, j, ret = -1;
+ int k;
+ BN_CTX *ctx = NULL;
+ BIGNUM *A1, *A1_odd, *check; /* taken from ctx */
+ BN_MONT_CTX *mont = NULL;
+ const BIGNUM *A = NULL;
+
+ if (BN_cmp(a, BN_value_one()) <= 0) {
+ return 0;
+ }
+
+ if (checks == BN_prime_checks) {
+ checks = BN_prime_checks_for_size(BN_num_bits(a));
+ }
+
+ /* first look for small factors */
+ if (!BN_is_odd(a)) {
+ /* a is even => a is prime if and only if a == 2 */
+ return BN_is_word(a, 2);
+ }
+
+ if (do_trial_division) {
+ for (i = 1; i < NUMPRIMES; i++) {
+ if (BN_mod_word(a, primes[i]) == 0) {
+ return 0;
+ }
+ }
+
+ if (!BN_GENCB_call(cb, 1, -1)) {
+ goto err;
+ }
+ }
+
+ if (ctx_passed != NULL) {
+ ctx = ctx_passed;
+ } else if ((ctx = BN_CTX_new()) == NULL) {
+ goto err;
+ }
+ BN_CTX_start(ctx);
+
+ /* A := abs(a) */
+ if (a->neg) {
+ BIGNUM *t;
+ if ((t = BN_CTX_get(ctx)) == NULL) {
+ goto err;
+ }
+ BN_copy(t, a);
+ t->neg = 0;
+ A = t;
+ } else {
+ A = a;
+ }
+
+ A1 = BN_CTX_get(ctx);
+ A1_odd = BN_CTX_get(ctx);
+ check = BN_CTX_get(ctx);
+ if (check == NULL) {
+ goto err;
+ }
+
+ /* compute A1 := A - 1 */
+ if (!BN_copy(A1, A)) {
+ goto err;
+ }
+ if (!BN_sub_word(A1, 1)) {
+ goto err;
+ }
+ if (BN_is_zero(A1)) {
+ ret = 0;
+ goto err;
+ }
+
+ /* write A1 as A1_odd * 2^k */
+ k = 1;
+ while (!BN_is_bit_set(A1, k)) {
+ k++;
+ }
+ if (!BN_rshift(A1_odd, A1, k)) {
+ goto err;
+ }
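+  /* For example, A = 221 gives A1 = 220 = 55 * 2^2, so A1_odd = 55 and
+   * k = 2. */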
+
+ /* Montgomery setup for computations mod A */
+ mont = BN_MONT_CTX_new();
+ if (mont == NULL) {
+ goto err;
+ }
+ if (!BN_MONT_CTX_set(mont, A, ctx)) {
+ goto err;
+ }
+
+ for (i = 0; i < checks; i++) {
+ if (!BN_pseudo_rand_range(check, A1)) {
+ goto err;
+ }
+ if (!BN_add_word(check, 1)) {
+ goto err;
+ }
+ /* now 1 <= check < A */
+
+ j = witness(check, A, A1, A1_odd, k, ctx, mont);
+ if (j == -1) {
+ goto err;
+ }
+ if (j) {
+ ret = 0;
+ goto err;
+ }
+ if (!BN_GENCB_call(cb, 1, i)) {
+ goto err;
+ }
+ }
+ ret = 1;
+
+err:
+ if (ctx != NULL) {
+ BN_CTX_end(ctx);
+ if (ctx_passed == NULL) {
+ BN_CTX_free(ctx);
+ }
+ }
+ if (mont != NULL) {
+ BN_MONT_CTX_free(mont);
+ }
+
+ return ret;
+}
+
+static int witness(BIGNUM *w, const BIGNUM *a, const BIGNUM *a1,
+ const BIGNUM *a1_odd, int k, BN_CTX *ctx,
+ BN_MONT_CTX *mont) {
+ if (!BN_mod_exp_mont(w, w, a1_odd, a, ctx, mont)) { /* w := w^a1_odd mod a */
+ return -1;
+ }
+ if (BN_is_one(w)) {
+ return 0; /* probably prime */
+ }
+ if (BN_cmp(w, a1) == 0) {
+ return 0; /* w == -1 (mod a), 'a' is probably prime */
+ }
+
+ while (--k) {
+ if (!BN_mod_mul(w, w, w, a, ctx)) { /* w := w^2 mod a */
+ return -1;
+ }
+
+ if (BN_is_one(w)) {
+ return 1; /* 'a' is composite, otherwise a previous 'w' would
+ * have been == -1 (mod 'a') */
+ }
+
+ if (BN_cmp(w, a1) == 0) {
+ return 0; /* w == -1 (mod a), 'a' is probably prime */
+ }
+ }
+
+ /* If we get here, 'w' is the (a-1)/2-th power of the original 'w',
+ * and it is neither -1 nor +1 -- so 'a' cannot be prime */
+ return 1;
+}
+
+static BN_ULONG get_word(const BIGNUM *bn) {
+ if (bn->top == 1) {
+ return bn->d[0];
+ }
+ return 0;
+}
+
+static int probable_prime(BIGNUM *rnd, int bits) {
+ int i;
+ uint16_t mods[NUMPRIMES];
+ BN_ULONG delta;
+ BN_ULONG maxdelta = BN_MASK2 - primes[NUMPRIMES - 1];
+ char is_single_word = bits <= BN_BITS2;
+
+again:
+ if (!BN_rand(rnd, bits, 1, 1)) {
+ return 0;
+ }
+
+ /* we now have a random number 'rnd' to test. */
+ for (i = 1; i < NUMPRIMES; i++) {
+ mods[i] = (uint16_t)BN_mod_word(rnd, (BN_ULONG)primes[i]);
+ }
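+  /* mods[i] caches rnd mod primes[i], so whether rnd + delta is divisible by
+   * primes[i] can later be tested as (mods[i] + delta) % primes[i] == 0 using
+   * only word arithmetic, rather than a bignum reduction per candidate. */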
+  /* If the candidate fits into a single word then additionally cap maxdelta
+   * so that adding delta cannot push the candidate beyond |bits| bits. */
+ if (is_single_word) {
+ BN_ULONG size_limit = (((BN_ULONG)1) << bits) - get_word(rnd) - 1;
+ if (size_limit < maxdelta) {
+ maxdelta = size_limit;
+ }
+ }
+ delta = 0;
+
+loop:
+ if (is_single_word) {
+ BN_ULONG rnd_word = get_word(rnd);
+
+    /* When the candidate prime is a single word we check that:
+     *   1) it's greater than primes[i], so that we don't reject, say, three
+     *      merely because it's a multiple of three; and
+     *   2) it's not a multiple of any known prime. We don't also check that
+     *      rnd-1 is coprime to all the known primes because there aren't
+     *      many small primes for which that holds. */
+ for (i = 1; i < NUMPRIMES && primes[i] < rnd_word; i++) {
+ if ((mods[i] + delta) % primes[i] == 0) {
+ delta += 2;
+ if (delta > maxdelta)
+ goto again;
+ goto loop;
+ }
+ }
+ } else {
+ for (i = 1; i < NUMPRIMES; i++) {
+      /* check that rnd is not divisible by primes[i] and also that
+       * gcd(rnd-1, primes) == 1 (except for 2) */
+ if (((mods[i] + delta) % primes[i]) <= 1) {
+ delta += 2;
+ if (delta > maxdelta)
+ goto again;
+ goto loop;
+ }
+ }
+ }
+
+ if (!BN_add_word(rnd, delta)) {
+ return 0;
+ }
+ if (BN_num_bits(rnd) != bits) {
+ goto again;
+ }
+
+ return 1;
+}
+
+static int probable_prime_dh(BIGNUM *rnd, int bits, const BIGNUM *add,
+ const BIGNUM *rem, BN_CTX *ctx) {
+ int i, ret = 0;
+ BIGNUM *t1;
+
+ BN_CTX_start(ctx);
+ if ((t1 = BN_CTX_get(ctx)) == NULL) {
+ goto err;
+ }
+
+ if (!BN_rand(rnd, bits, 0, 1)) {
+ goto err;
+ }
+
+ /* we need ((rnd-rem) % add) == 0 */
+
+ if (!BN_mod(t1, rnd, add, ctx)) {
+ goto err;
+ }
+ if (!BN_sub(rnd, rnd, t1)) {
+ goto err;
+ }
+ if (rem == NULL) {
+ if (!BN_add_word(rnd, 1)) {
+ goto err;
+ }
+ } else {
+ if (!BN_add(rnd, rnd, rem)) {
+ goto err;
+ }
+ }
+  /* we now have a random number 'rnd' to test. */
+
+loop:
+ for (i = 1; i < NUMPRIMES; i++) {
+    /* check that rnd is not divisible by primes[i] and also that
+     * gcd(rnd-1, primes) == 1 (except for 2) */
+ if (BN_mod_word(rnd, (BN_ULONG)primes[i]) <= 1) {
+ if (!BN_add(rnd, rnd, add)) {
+ goto err;
+ }
+ goto loop;
+ }
+ }
+
+ ret = 1;
+
+err:
+ BN_CTX_end(ctx);
+ return ret;
+}
+
+static int probable_prime_dh_safe(BIGNUM *p, int bits, const BIGNUM *padd,
+ const BIGNUM *rem, BN_CTX *ctx) {
+ int i, ret = 0;
+ BIGNUM *t1, *qadd, *q;
+
+ bits--;
+ BN_CTX_start(ctx);
+ t1 = BN_CTX_get(ctx);
+ q = BN_CTX_get(ctx);
+ qadd = BN_CTX_get(ctx);
+ if (qadd == NULL) {
+ goto err;
+ }
+
+ if (!BN_rshift1(qadd, padd)) {
+ goto err;
+ }
+
+ if (!BN_rand(q, bits, 0, 1)) {
+ goto err;
+ }
+
+  /* we need ((p - rem) % padd) == 0 for the eventual p = 2*q + 1, which in
+   * terms of q means ((q - rem/2) % qadd) == 0 */
+ if (!BN_mod(t1, q, qadd, ctx)) {
+ goto err;
+ }
+
+ if (!BN_sub(q, q, t1)) {
+ goto err;
+ }
+
+ if (rem == NULL) {
+ if (!BN_add_word(q, 1)) {
+ goto err;
+ }
+ } else {
+ if (!BN_rshift1(t1, rem)) {
+ goto err;
+ }
+ if (!BN_add(q, q, t1)) {
+ goto err;
+ }
+ }
+
+  /* we now have a random number 'q'; form the candidate p = 2*q + 1. */
+ if (!BN_lshift1(p, q)) {
+ goto err;
+ }
+ if (!BN_add_word(p, 1)) {
+ goto err;
+ }
+
+loop:
+ for (i = 1; i < NUMPRIMES; i++) {
+    /* Check that neither p nor q is divisible by primes[i]. Since
+     * p - 1 = 2*q, this also ensures gcd(p-1, primes) == 1 (except for 2). */
+ if ((BN_mod_word(p, (BN_ULONG)primes[i]) == 0) ||
+ (BN_mod_word(q, (BN_ULONG)primes[i]) == 0)) {
+ if (!BN_add(p, p, padd)) {
+ goto err;
+ }
+ if (!BN_add(q, q, qadd)) {
+ goto err;
+ }
+ goto loop;
+ }
+ }
+
+ ret = 1;
+
+err:
+ BN_CTX_end(ctx);
+ return ret;
+}
diff --git a/src/crypto/bn/random.c b/src/crypto/bn/random.c
new file mode 100644
index 0000000..285bf26
--- /dev/null
+++ b/src/crypto/bn/random.c
@@ -0,0 +1,328 @@
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to. The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code. The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * "This product includes cryptographic software written by
+ * Eric Young (eay@cryptsoft.com)"
+ * The word 'cryptographic' can be left out if the rouines from the library
+ * being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ * the apps directory (application code) you must include an acknowledgement:
+ * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed. i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+/* ====================================================================
+ * Copyright (c) 1998-2001 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com). */
+
+#include <openssl/bn.h>
+
+#include <string.h>
+
+#include <openssl/err.h>
+#include <openssl/mem.h>
+#include <openssl/rand.h>
+#include <openssl/sha.h>
+
+int BN_rand(BIGNUM *rnd, int bits, int top, int bottom) {
+ uint8_t *buf = NULL;
+ int ret = 0, bit, bytes, mask;
+
+ if (rnd == NULL) {
+ return 0;
+ }
+
+ if (bits == 0) {
+ BN_zero(rnd);
+ return 1;
+ }
+
+ bytes = (bits + 7) / 8;
+ bit = (bits - 1) % 8;
+ mask = 0xff << (bit + 1);
+
+ buf = OPENSSL_malloc(bytes);
+ if (buf == NULL) {
+ OPENSSL_PUT_ERROR(BN, BN_rand, ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+
+ /* Make a random number and set the top and bottom bits. */
+ if (!RAND_bytes(buf, bytes)) {
+ goto err;
+ }
+
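+  /* top == 1 forces the two most significant bits to one (so that the product
+   * of two such values has exactly 2*bits bits), top == 0 forces only the
+   * most significant bit, and top == -1 leaves the high bits random. */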
+ if (top != -1) {
+ if (top) {
+ if (bit == 0) {
+ buf[0] = 1;
+ buf[1] |= 0x80;
+ } else {
+ buf[0] |= (3 << (bit - 1));
+ }
+ } else {
+ buf[0] |= (1 << bit);
+ }
+ }
+
+ buf[0] &= ~mask;
+
+ /* set bottom bit if requested */
+ if (bottom) {
+ buf[bytes - 1] |= 1;
+ }
+
+ if (!BN_bin2bn(buf, bytes, rnd)) {
+ goto err;
+ }
+
+ ret = 1;
+
+err:
+ if (buf != NULL) {
+ OPENSSL_cleanse(buf, bytes);
+ OPENSSL_free(buf);
+ }
+  return ret;
+}
+
+int BN_pseudo_rand(BIGNUM *rnd, int bits, int top, int bottom) {
+ return BN_rand(rnd, bits, top, bottom);
+}
+
+int BN_rand_range(BIGNUM *r, const BIGNUM *range) {
+ unsigned n;
+ unsigned count = 100;
+
+ if (range->neg || BN_is_zero(range)) {
+ OPENSSL_PUT_ERROR(BN, BN_rand_range, BN_R_INVALID_RANGE);
+ return 0;
+ }
+
+ n = BN_num_bits(range); /* n > 0 */
+
+ /* BN_is_bit_set(range, n - 1) always holds */
+ if (n == 1) {
+ BN_zero(r);
+ } else if (!BN_is_bit_set(range, n - 2) && !BN_is_bit_set(range, n - 3)) {
+ /* range = 100..._2,
+ * so 3*range (= 11..._2) is exactly one bit longer than range */
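+    /* For example, with range = 100_2 (n = 3), candidates are drawn from
+     * [0, 2^4) and any r < 3*range = 1100_2 is folded into [0, range) by the
+     * two conditional subtractions below; larger values are redrawn. */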
+ do {
+ if (!BN_rand(r, n + 1, -1 /* don't set most significant bits */,
+ 0 /* don't set least significant bits */)) {
+ return 0;
+ }
+
+ /* If r < 3*range, use r := r MOD range (which is either r, r - range, or
+ * r - 2*range). Otherwise, iterate again. Since 3*range = 11..._2, each
+ * iteration succeeds with probability >= .75. */
+ if (BN_cmp(r, range) >= 0) {
+ if (!BN_sub(r, r, range)) {
+ return 0;
+ }
+ if (BN_cmp(r, range) >= 0) {
+ if (!BN_sub(r, r, range)) {
+ return 0;
+ }
+ }
+ }
+
+ if (!--count) {
+ OPENSSL_PUT_ERROR(BN, BN_rand_range, BN_R_TOO_MANY_ITERATIONS);
+ return 0;
+ }
+ } while (BN_cmp(r, range) >= 0);
+ } else {
+ do {
+ /* range = 11..._2 or range = 101..._2 */
+ if (!BN_rand(r, n, -1, 0)) {
+ return 0;
+ }
+
+ if (!--count) {
+ OPENSSL_PUT_ERROR(BN, BN_rand_range, BN_R_TOO_MANY_ITERATIONS);
+ return 0;
+ }
+ } while (BN_cmp(r, range) >= 0);
+ }
+
+ return 1;
+}
+
+int BN_pseudo_rand_range(BIGNUM *r, const BIGNUM *range) {
+ return BN_rand_range(r, range);
+}
+
+int BN_generate_dsa_nonce(BIGNUM *out, const BIGNUM *range, const BIGNUM *priv,
+ const uint8_t *message, size_t message_len,
+ BN_CTX *ctx) {
+ SHA512_CTX sha;
+ /* We use 512 bits of random data per iteration to
+ * ensure that we have at least |range| bits of randomness. */
+ uint8_t random_bytes[64];
+ uint8_t digest[SHA512_DIGEST_LENGTH];
+ size_t done, todo, attempt;
+ const unsigned num_k_bytes = BN_num_bytes(range);
+ const unsigned bits_to_mask = (8 - (BN_num_bits(range) % 8)) % 8;
+ uint8_t private_bytes[96];
+ uint8_t *k_bytes = NULL;
+ int ret = 0;
+
+ if (out == NULL) {
+ return 0;
+ }
+
+ if (BN_is_zero(range)) {
+ OPENSSL_PUT_ERROR(BN, BN_generate_dsa_nonce, BN_R_DIV_BY_ZERO);
+ goto err;
+ }
+
+ k_bytes = OPENSSL_malloc(num_k_bytes);
+ if (!k_bytes) {
+ OPENSSL_PUT_ERROR(BN, BN_generate_dsa_nonce, ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+
+  /* We copy |priv| into a local buffer to avoid further exposing its
+ * length. */
+ todo = sizeof(priv->d[0]) * priv->top;
+ if (todo > sizeof(private_bytes)) {
+ /* No reasonable DSA or ECDSA key should have a private key
+ * this large and we don't handle this case in order to avoid
+ * leaking the length of the private key. */
+ OPENSSL_PUT_ERROR(BN, BN_generate_dsa_nonce, BN_R_PRIVATE_KEY_TOO_LARGE);
+ goto err;
+ }
+ memcpy(private_bytes, priv->d, todo);
+ memset(private_bytes + todo, 0, sizeof(private_bytes) - todo);
+
+ for (attempt = 0;; attempt++) {
+ for (done = 0; done < num_k_bytes;) {
+ if (!RAND_bytes(random_bytes, sizeof(random_bytes))) {
+ goto err;
+ }
+ SHA512_Init(&sha);
+ SHA512_Update(&sha, &attempt, sizeof(attempt));
+ SHA512_Update(&sha, &done, sizeof(done));
+ SHA512_Update(&sha, private_bytes, sizeof(private_bytes));
+ SHA512_Update(&sha, message, message_len);
+ SHA512_Update(&sha, random_bytes, sizeof(random_bytes));
+ SHA512_Final(digest, &sha);
+
+ todo = num_k_bytes - done;
+ if (todo > SHA512_DIGEST_LENGTH) {
+ todo = SHA512_DIGEST_LENGTH;
+ }
+ memcpy(k_bytes + done, digest, todo);
+ done += todo;
+ }
+
+ k_bytes[0] &= 0xff >> bits_to_mask;
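+    /* Masking the excess top bits keeps the candidate at most as long as
+     * |range| in bits, so the BN_cmp check below accepts with probability at
+     * least 1/2 per attempt. */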
+
+ if (!BN_bin2bn(k_bytes, num_k_bytes, out)) {
+ goto err;
+ }
+ if (BN_cmp(out, range) < 0) {
+ break;
+ }
+ }
+
+ ret = 1;
+
+err:
+ if (k_bytes) {
+ OPENSSL_free(k_bytes);
+ }
+ return ret;
+}
diff --git a/src/crypto/bn/rsaz_exp.c b/src/crypto/bn/rsaz_exp.c
new file mode 100644
index 0000000..c802752
--- /dev/null
+++ b/src/crypto/bn/rsaz_exp.c
@@ -0,0 +1,326 @@
+/*****************************************************************************
+* *
+* Copyright (c) 2012, Intel Corporation *
+* *
+* All rights reserved. *
+* *
+* Redistribution and use in source and binary forms, with or without *
+* modification, are permitted provided that the following conditions are *
+* met: *
+* *
+* * Redistributions of source code must retain the above copyright *
+* notice, this list of conditions and the following disclaimer. *
+* *
+* * Redistributions in binary form must reproduce the above copyright *
+* notice, this list of conditions and the following disclaimer in the *
+* documentation and/or other materials provided with the *
+* distribution. *
+* *
+* * Neither the name of the Intel Corporation nor the names of its *
+* contributors may be used to endorse or promote products derived from *
+* this software without specific prior written permission. *
+* *
+* *
+* THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY *
+* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE *
+* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR *
+* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR *
+* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, *
+* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, *
+* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR *
+* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF *
+* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
+* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
+* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
+* *
+******************************************************************************
+* Developers and authors: *
+* Shay Gueron (1, 2), and Vlad Krasnov (1) *
+* (1) Intel Corporation, Israel Development Center, Haifa, Israel *
+* (2) University of Haifa, Israel *
+*****************************************************************************/
+
+#include <openssl/base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64)
+
+#include "rsaz_exp.h"
+
+#include <openssl/mem.h>
+
+/*
+ * See crypto/bn/asm/rsaz-avx2.pl for further details.
+ */
+void rsaz_1024_norm2red_avx2(void *red, const void *norm);
+void rsaz_1024_mul_avx2(void *ret, const void *a, const void *b, const void *n,
+                        BN_ULONG k);
+void rsaz_1024_sqr_avx2(void *ret, const void *a, const void *n, BN_ULONG k,
+                        int cnt);
+void rsaz_1024_scatter5_avx2(void *tbl, const void *val, int i);
+void rsaz_1024_gather5_avx2(void *val, const void *tbl, int i);
+void rsaz_1024_red2norm_avx2(void *norm, const void *red);
+
+#if defined(__GNUC__)
+# define ALIGN64 __attribute__((aligned(64)))
+#elif defined(_MSC_VER)
+# define ALIGN64 __declspec(align(64))
+#elif defined(__SUNPRO_C)
+# define ALIGN64
+# pragma align 64(one,two80)
+#else
+# define ALIGN64 /* not fatal, might hurt performance a little */
+#endif
+
+ALIGN64 static const BN_ULONG one[40] =
+ {1,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+ALIGN64 static const BN_ULONG two80[40] =
+ {0,0,1<<22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+
+void RSAZ_1024_mod_exp_avx2(BN_ULONG result_norm[16],
+ const BN_ULONG base_norm[16], const BN_ULONG exponent[16],
+ const BN_ULONG m_norm[16], const BN_ULONG RR[16], BN_ULONG k0)
+{
+ unsigned char storage[320*3+32*9*16+64]; /* 5.5KB */
+ unsigned char *p_str = storage + (64-((size_t)storage%64));
+ unsigned char *a_inv, *m, *result,
+ *table_s = p_str+320*3,
+ *R2 = table_s; /* borrow */
+ int index;
+ int wvalue;
+
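+
+ /* Place the three 320-byte buffers so that |m| never straddles a 4KB page
+  * boundary: the test below checks whether the first 320 bytes after p_str
+  * would cross a page, and picks the layout accordingly. */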
+ if ((((size_t)p_str&4095)+320)>>12) {
+ result = p_str;
+ a_inv = p_str + 320;
+ m = p_str + 320*2; /* should not cross page */
+ } else {
+ m = p_str; /* should not cross page */
+ result = p_str + 320;
+ a_inv = p_str + 320*2;
+ }
+
+ rsaz_1024_norm2red_avx2(m, m_norm);
+ rsaz_1024_norm2red_avx2(a_inv, base_norm);
+ rsaz_1024_norm2red_avx2(R2, RR);
+
+ rsaz_1024_mul_avx2(R2, R2, R2, m, k0);
+ rsaz_1024_mul_avx2(R2, R2, two80, m, k0);
+
+ /* table[0] = 1 */
+ rsaz_1024_mul_avx2(result, R2, one, m, k0);
+ /* table[1] = a_inv^1 */
+ rsaz_1024_mul_avx2(a_inv, a_inv, R2, m, k0);
+
+ rsaz_1024_scatter5_avx2(table_s,result,0);
+ rsaz_1024_scatter5_avx2(table_s,a_inv,1);
+
+ /* table[2] = a_inv^2 */
+ rsaz_1024_sqr_avx2(result, a_inv, m, k0, 1);
+ rsaz_1024_scatter5_avx2(table_s,result,2);
+#if 0
+ /* this is almost 2x smaller and less than 1% slower */
+ for (index=3; index<32; index++) {
+ rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+ rsaz_1024_scatter5_avx2(table_s,result,index);
+ }
+#else
+ /* table[4] = a_inv^4 */
+ rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+ rsaz_1024_scatter5_avx2(table_s,result,4);
+ /* table[8] = a_inv^8 */
+ rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+ rsaz_1024_scatter5_avx2(table_s,result,8);
+ /* table[16] = a_inv^16 */
+ rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+ rsaz_1024_scatter5_avx2(table_s,result,16);
+ /* table[17] = a_inv^17 */
+ rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+ rsaz_1024_scatter5_avx2(table_s,result,17);
+
+ /* table[3] */
+ rsaz_1024_gather5_avx2(result,table_s,2);
+ rsaz_1024_mul_avx2(result,result,a_inv,m,k0);
+ rsaz_1024_scatter5_avx2(table_s,result,3);
+ /* table[6] */
+ rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+ rsaz_1024_scatter5_avx2(table_s,result,6);
+ /* table[12] */
+ rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+ rsaz_1024_scatter5_avx2(table_s,result,12);
+ /* table[24] */
+ rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+ rsaz_1024_scatter5_avx2(table_s,result,24);
+ /* table[25] */
+ rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+ rsaz_1024_scatter5_avx2(table_s,result,25);
+
+ /* table[5] */
+ rsaz_1024_gather5_avx2(result,table_s,4);
+ rsaz_1024_mul_avx2(result,result,a_inv,m,k0);
+ rsaz_1024_scatter5_avx2(table_s,result,5);
+ /* table[10] */
+ rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+ rsaz_1024_scatter5_avx2(table_s,result,10);
+ /* table[20] */
+ rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+ rsaz_1024_scatter5_avx2(table_s,result,20);
+ /* table[21] */
+ rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+ rsaz_1024_scatter5_avx2(table_s,result,21);
+
+ /* table[7] */
+ rsaz_1024_gather5_avx2(result,table_s,6);
+ rsaz_1024_mul_avx2(result,result,a_inv,m,k0);
+ rsaz_1024_scatter5_avx2(table_s,result,7);
+ /* table[14] */
+ rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+ rsaz_1024_scatter5_avx2(table_s,result,14);
+ /* table[28] */
+ rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+ rsaz_1024_scatter5_avx2(table_s,result,28);
+ /* table[29] */
+ rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+ rsaz_1024_scatter5_avx2(table_s,result,29);
+
+ /* table[9] */
+ rsaz_1024_gather5_avx2(result,table_s,8);
+ rsaz_1024_mul_avx2(result,result,a_inv,m,k0);
+ rsaz_1024_scatter5_avx2(table_s,result,9);
+ /* table[18] */
+ rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+ rsaz_1024_scatter5_avx2(table_s,result,18);
+ /* table[19] */
+ rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+ rsaz_1024_scatter5_avx2(table_s,result,19);
+
+ /* table[11] */
+ rsaz_1024_gather5_avx2(result,table_s,10);
+ rsaz_1024_mul_avx2(result,result,a_inv,m,k0);
+ rsaz_1024_scatter5_avx2(table_s,result,11);
+ /* table[22] */
+ rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+ rsaz_1024_scatter5_avx2(table_s,result,22);
+ /* table[23] */
+ rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+ rsaz_1024_scatter5_avx2(table_s,result,23);
+
+ /* table[13] */
+ rsaz_1024_gather5_avx2(result,table_s,12);
+ rsaz_1024_mul_avx2(result,result,a_inv,m,k0);
+ rsaz_1024_scatter5_avx2(table_s,result,13);
+ /* table[26] */
+ rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+ rsaz_1024_scatter5_avx2(table_s,result,26);
+ /* table[27] */
+ rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+ rsaz_1024_scatter5_avx2(table_s,result,27);
+
+ /* table[15] */
+ rsaz_1024_gather5_avx2(result,table_s,14);
+ rsaz_1024_mul_avx2(result,result,a_inv,m,k0);
+ rsaz_1024_scatter5_avx2(table_s,result,15);
+ /* table[30] */
+ rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+ rsaz_1024_scatter5_avx2(table_s,result,30);
+ /* table[31] */
+ rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+ rsaz_1024_scatter5_avx2(table_s,result,31);
+#endif
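+
+ /* table_s now holds a_inv^i in Montgomery form for i = 0..31, ready for the
+  * 5-bit fixed-window exponentiation below. */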
+
+ /* load first window */
+ p_str = (unsigned char*)exponent;
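+ /* p_str[127] is the most significant byte of the little-endian 1024-bit
+  * exponent; its top five bits form the first window. */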
+ wvalue = p_str[127] >> 3;
+ rsaz_1024_gather5_avx2(result,table_s,wvalue);
+
+ index = 1014;
+
+ while (index > -1) { /* loop over the remaining 203 5-bit windows */
+
+ rsaz_1024_sqr_avx2(result, result, m, k0, 5);
+
+ wvalue = *((unsigned short*)&p_str[index/8]);
+ wvalue = (wvalue>> (index%8)) & 31;
+ index-=5;
+
+ rsaz_1024_gather5_avx2(a_inv,table_s,wvalue); /* borrow a_inv */
+ rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+ }
+
+ /* square four times */
+ rsaz_1024_sqr_avx2(result, result, m, k0, 4);
+
+ wvalue = p_str[0] & 15;
+
+ rsaz_1024_gather5_avx2(a_inv,table_s,wvalue); /* borrow a_inv */
+ rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+
+ /* from Montgomery */
+ rsaz_1024_mul_avx2(result, result, one, m, k0);
+
+ rsaz_1024_red2norm_avx2(result_norm, result);
+
+ OPENSSL_cleanse(storage,sizeof(storage));
+}
+
+/*
+ * See crypto/bn/rsaz-x86_64.pl for further details.
+ */
+void rsaz_512_mul(void *ret, const void *a, const void *b, const void *n,
+                  BN_ULONG k);
+void rsaz_512_mul_scatter4(void *ret, const void *a, const void *n, BN_ULONG k,
+                           const void *tbl, unsigned int power);
+void rsaz_512_mul_gather4(void *ret, const void *a, const void *tbl,
+                          const void *n, BN_ULONG k, unsigned int power);
+void rsaz_512_mul_by_one(void *ret, const void *a, const void *n, BN_ULONG k);
+void rsaz_512_sqr(void *ret, const void *a, const void *n, BN_ULONG k,
+                  int cnt);
+void rsaz_512_scatter4(void *tbl, const BN_ULONG *val, int power);
+void rsaz_512_gather4(BN_ULONG *val, const void *tbl, int power);
+
+void RSAZ_512_mod_exp(BN_ULONG result[8],
+ const BN_ULONG base[8], const BN_ULONG exponent[8],
+ const BN_ULONG m[8], BN_ULONG k0, const BN_ULONG RR[8])
+{
+ unsigned char storage[16*8*8+64*2+64]; /* 1.2KB */
+ unsigned char *table = storage + (64-((size_t)storage%64));
+ BN_ULONG *a_inv = (BN_ULONG *)(table+16*8*8),
+ *temp = (BN_ULONG *)(table+16*8*8+8*8);
+ unsigned char *p_str = (unsigned char*)exponent;
+ int index;
+ unsigned int wvalue;
+
+ /* table[0] = one in Montgomery form */
+ temp[0] = 0-m[0]; temp[1] = ~m[1];
+ temp[2] = ~m[2]; temp[3] = ~m[3];
+ temp[4] = ~m[4]; temp[5] = ~m[5];
+ temp[6] = ~m[6]; temp[7] = ~m[7];
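+ /* For a full 512-bit modulus, 0 - m (mod 2^512) equals 2^512 - m = R - m,
+  * which is congruent to R mod m: the Montgomery representation of one. */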
+ rsaz_512_scatter4(table, temp, 0);
+
+ /* table [1] = a_inv^1 */
+ rsaz_512_mul(a_inv, base, RR, m, k0);
+ rsaz_512_scatter4(table, a_inv, 1);
+
+ /* table [2] = a_inv^2 */
+ rsaz_512_sqr(temp, a_inv, m, k0, 1);
+ rsaz_512_scatter4(table, temp, 2);
+
+ for (index=3; index<16; index++)
+ rsaz_512_mul_scatter4(temp, a_inv, m, k0, table, index);
+
+ /* load first window */
+ wvalue = p_str[63];
+
+ rsaz_512_gather4(temp, table, wvalue>>4);
+ rsaz_512_sqr(temp, temp, m, k0, 4);
+ rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue&0xf);
+
+ for (index=62; index>=0; index--) {
+ wvalue = p_str[index];
+
+ rsaz_512_sqr(temp, temp, m, k0, 4);
+ rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue>>4);
+
+ rsaz_512_sqr(temp, temp, m, k0, 4);
+ rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue&0x0f);
+ }
+
+ /* from Montgomery */
+ rsaz_512_mul_by_one(result, temp, m, k0);
+
+ OPENSSL_cleanse(storage,sizeof(storage));
+}
+
+#endif /* OPENSSL_X86_64 */
diff --git a/src/crypto/bn/rsaz_exp.h b/src/crypto/bn/rsaz_exp.h
new file mode 100644
index 0000000..0bb6b0c
--- /dev/null
+++ b/src/crypto/bn/rsaz_exp.h
@@ -0,0 +1,44 @@
+/******************************************************************************
+* Copyright(c) 2012, Intel Corp.
+* Developers and authors:
+* Shay Gueron (1, 2), and Vlad Krasnov (1)
+* (1) Intel Corporation, Israel Development Center, Haifa, Israel
+* (2) University of Haifa, Israel
+******************************************************************************
+* LICENSE:
+* This submission to OpenSSL is to be made available under the OpenSSL
+* license, and only to the OpenSSL project, in order to allow integration
+* into the publicly distributed code.
+* The use of this code, or portions of this code, or concepts embedded in
+* this code, or modification of this code and/or algorithm(s) in it, or the
+* use of this code for any other purpose than stated above, requires special
+* licensing.
+******************************************************************************
+* DISCLAIMER:
+* THIS SOFTWARE IS PROVIDED BY THE CONTRIBUTORS AND THE COPYRIGHT OWNERS
+* ``AS IS''. ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS OR THE COPYRIGHT
+* OWNERS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+* OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+* POSSIBILITY OF SUCH DAMAGE.
+******************************************************************************/
+
+#ifndef RSAZ_EXP_H
+#define RSAZ_EXP_H
+
+#include <openssl/bn.h>
+
+void RSAZ_1024_mod_exp_avx2(BN_ULONG result[16],
+ const BN_ULONG base_norm[16], const BN_ULONG exponent[16],
+ const BN_ULONG m_norm[16], const BN_ULONG RR[16], BN_ULONG k0);
+int rsaz_avx2_eligible(void);
+
+void RSAZ_512_mod_exp(BN_ULONG result[8],
+ const BN_ULONG base_norm[8], const BN_ULONG exponent[8],
+ const BN_ULONG m_norm[8], BN_ULONG k0, const BN_ULONG RR[8]);
+#endif
diff --git a/src/crypto/bn/shift.c b/src/crypto/bn/shift.c
new file mode 100644
index 0000000..1e3b7c3
--- /dev/null
+++ b/src/crypto/bn/shift.c
@@ -0,0 +1,287 @@
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to. The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code. The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * "This product includes cryptographic software written by
+ * Eric Young (eay@cryptsoft.com)"
+ * The word 'cryptographic' can be left out if the rouines from the library
+ * being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ * the apps directory (application code) you must include an acknowledgement:
+ * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed. i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.] */
+
+#include <openssl/bn.h>
+
+#include <string.h>
+
+#include "internal.h"
+
+
+int BN_lshift(BIGNUM *r, const BIGNUM *a, int n) {
+ int i, nw, lb, rb;
+ BN_ULONG *t, *f;
+ BN_ULONG l;
+
+ r->neg = a->neg;
+ nw = n / BN_BITS2;
+ if (bn_wexpand(r, a->top + nw + 1) == NULL) {
+ return 0;
+ }
+ lb = n % BN_BITS2;
+ rb = BN_BITS2 - lb;
+ f = a->d;
+ t = r->d;
+ t[a->top + nw] = 0;
+ if (lb == 0) {
+ for (i = a->top - 1; i >= 0; i--) {
+ t[nw + i] = f[i];
+ }
+ } else {
+ for (i = a->top - 1; i >= 0; i--) {
+ l = f[i];
+ t[nw + i + 1] |= (l >> rb) & BN_MASK2;
+ t[nw + i] = (l << lb) & BN_MASK2;
+ }
+ }
+ memset(t, 0, nw * sizeof(t[0]));
+ r->top = a->top + nw + 1;
+ bn_correct_top(r);
+
+ return 1;
+}
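+
+/* Example: on a 64-bit build, shifting by n = 70 gives nw = 1 whole-word
+ * shift and lb = 6 residual bits, which the loop above spreads across
+ * adjacent words. */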
+
+int BN_lshift1(BIGNUM *r, const BIGNUM *a) {
+ BN_ULONG *ap, *rp, t, c;
+ int i;
+
+ if (r != a) {
+ r->neg = a->neg;
+ if (bn_wexpand(r, a->top + 1) == NULL) {
+ return 0;
+ }
+ r->top = a->top;
+ } else {
+ if (bn_wexpand(r, a->top + 1) == NULL) {
+ return 0;
+ }
+ }
+ ap = a->d;
+ rp = r->d;
+ c = 0;
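+  /* Shift word by word, carrying the top bit of each word into the bottom
+   * bit of the next. */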
+ for (i = 0; i < a->top; i++) {
+ t = *(ap++);
+ *(rp++) = ((t << 1) | c) & BN_MASK2;
+ c = (t & BN_TBIT) ? 1 : 0;
+ }
+ if (c) {
+ *rp = 1;
+ r->top++;
+ }
+
+ return 1;
+}
+
+int BN_rshift(BIGNUM *r, const BIGNUM *a, int n) {
+ int i, j, nw, lb, rb;
+ BN_ULONG *t, *f;
+ BN_ULONG l, tmp;
+
+ nw = n / BN_BITS2;
+ rb = n % BN_BITS2;
+ lb = BN_BITS2 - rb;
+ if (nw >= a->top || a->top == 0) {
+ BN_zero(r);
+ return 1;
+ }
+ i = (BN_num_bits(a) - n + (BN_BITS2 - 1)) / BN_BITS2;
+ if (r != a) {
+ r->neg = a->neg;
+ if (bn_wexpand(r, i) == NULL) {
+ return 0;
+ }
+ } else {
+ if (n == 0) {
+ return 1; /* or the copying loop will go berserk */
+ }
+ }
+
+ f = &(a->d[nw]);
+ t = r->d;
+ j = a->top - nw;
+ r->top = i;
+
+ if (rb == 0) {
+ for (i = j; i != 0; i--) {
+ *(t++) = *(f++);
+ }
+ } else {
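+    /* Each result word straddles two source words: the high bits of one
+     * word shifted down, OR'd with the low bits of the next shifted up. */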
+ l = *(f++);
+ for (i = j - 1; i != 0; i--) {
+ tmp = (l >> rb) & BN_MASK2;
+ l = *(f++);
+ *(t++) = (tmp | (l << lb)) & BN_MASK2;
+ }
+    if ((l = (l >> rb) & BN_MASK2)) {
+      *t = l;
+ }
+ }
+
+ return 1;
+}
+
+int BN_rshift1(BIGNUM *r, const BIGNUM *a) {
+ BN_ULONG *ap, *rp, t, c;
+ int i, j;
+
+ if (BN_is_zero(a)) {
+ BN_zero(r);
+ return 1;
+ }
+ i = a->top;
+ ap = a->d;
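+  /* If the top word is exactly 1, the result has one word fewer. */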
+ j = i - (ap[i - 1] == 1);
+ if (a != r) {
+ if (bn_wexpand(r, j) == NULL) {
+ return 0;
+ }
+ r->neg = a->neg;
+ }
+ rp = r->d;
+ t = ap[--i];
+ c = (t & 1) ? BN_TBIT : 0;
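+  /* Store the shifted top word only if it is non-zero; if it became zero,
+   * r->top (set to j below) already excludes it. */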
+ if (t >>= 1) {
+ rp[i] = t;
+ }
+ while (i > 0) {
+ t = ap[--i];
+ rp[i] = ((t >> 1) & BN_MASK2) | c;
+ c = (t & 1) ? BN_TBIT : 0;
+ }
+ r->top = j;
+
+ return 1;
+}
+
+int BN_set_bit(BIGNUM *a, int n) {
+ int i, j, k;
+
+ if (n < 0) {
+ return 0;
+ }
+
+ i = n / BN_BITS2;
+ j = n % BN_BITS2;
+ if (a->top <= i) {
+ if (bn_wexpand(a, i + 1) == NULL) {
+ return 0;
+ }
+ for (k = a->top; k < i + 1; k++) {
+ a->d[k] = 0;
+ }
+ a->top = i + 1;
+ }
+
+ a->d[i] |= (((BN_ULONG)1) << j);
+
+ return 1;
+}
+
+int BN_clear_bit(BIGNUM *a, int n) {
+ int i, j;
+
+ if (n < 0) {
+ return 0;
+ }
+
+ i = n / BN_BITS2;
+ j = n % BN_BITS2;
+ if (a->top <= i) {
+ return 0;
+ }
+
+ a->d[i] &= (~(((BN_ULONG)1) << j));
+ bn_correct_top(a);
+ return 1;
+}
+
+int BN_is_bit_set(const BIGNUM *a, int n) {
+ int i, j;
+
+ if (n < 0) {
+ return 0;
+ }
+ i = n / BN_BITS2;
+ j = n % BN_BITS2;
+ if (a->top <= i) {
+ return 0;
+ }
+
+  return (a->d[i] >> j) & 1;
+}
+
+int BN_mask_bits(BIGNUM *a, int n) {
+ int b, w;
+
+ if (n < 0) {
+ return 0;
+ }
+
+ w = n / BN_BITS2;
+ b = n % BN_BITS2;
+ if (w >= a->top) {
+ return 0;
+ }
+ if (b == 0) {
+ a->top = w;
+ } else {
+ a->top = w + 1;
+ a->d[w] &= ~(BN_MASK2 << b);
+ }
+
+ bn_correct_top(a);
+ return 1;
+}
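+
+/* A small usage sketch of the bit operations above (illustrative only,
+ * assuming a 64-bit build):
+ *
+ *   BIGNUM *x = BN_new();
+ *   BN_set_bit(x, 100);             // x == 2^100, x->top == 2
+ *   assert(BN_is_bit_set(x, 100));
+ *   BN_mask_bits(x, 100);           // keep the low 100 bits: x is now zero
+ *   assert(BN_is_zero(x));
+ *   BN_free(x);
+ */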
diff --git a/src/crypto/bn/sqrt.c b/src/crypto/bn/sqrt.c
new file mode 100644
index 0000000..07041f9
--- /dev/null
+++ b/src/crypto/bn/sqrt.c
@@ -0,0 +1,505 @@
+/* Written by Lenka Fibikova <fibikova@exp-math.uni-essen.de>
+ * and Bodo Moeller for the OpenSSL project. */
+/* ====================================================================
+ * Copyright (c) 1998-2000 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com). */
+
+#include <openssl/bn.h>
+
+#include <openssl/err.h>
+
+
+/* Returns 'ret' such that
+ * ret^2 == a (mod p),
+ * using the Tonelli/Shanks algorithm (cf. Henri Cohen, "A Course
+ * in Computational Algebraic Number Theory", algorithm 1.5.1).
+ * 'p' must be prime! */
+BIGNUM *BN_mod_sqrt(BIGNUM *in, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx) {
+ BIGNUM *ret = in;
+ int err = 1;
+ int r;
+ BIGNUM *A, *b, *q, *t, *x, *y;
+ int e, i, j;
+
+ if (!BN_is_odd(p) || BN_abs_is_word(p, 1)) {
+ if (BN_abs_is_word(p, 2)) {
+ if (ret == NULL) {
+ ret = BN_new();
+ }
+ if (ret == NULL) {
+        return NULL;
+ }
+ if (!BN_set_word(ret, BN_is_bit_set(a, 0))) {
+ if (ret != in) {
+ BN_free(ret);
+ }
+ return NULL;
+ }
+ return ret;
+ }
+
+ OPENSSL_PUT_ERROR(BN, BN_mod_sqrt, BN_R_P_IS_NOT_PRIME);
+    return NULL;
+ }
+
+ if (BN_is_zero(a) || BN_is_one(a)) {
+ if (ret == NULL) {
+ ret = BN_new();
+ }
+ if (ret == NULL) {
+      return NULL;
+ }
+ if (!BN_set_word(ret, BN_is_one(a))) {
+ if (ret != in) {
+ BN_free(ret);
+ }
+ return NULL;
+ }
+ return ret;
+ }
+
+ BN_CTX_start(ctx);
+ A = BN_CTX_get(ctx);
+ b = BN_CTX_get(ctx);
+ q = BN_CTX_get(ctx);
+ t = BN_CTX_get(ctx);
+ x = BN_CTX_get(ctx);
+ y = BN_CTX_get(ctx);
+ if (y == NULL) {
+ goto end;
+ }
+
+ if (ret == NULL) {
+ ret = BN_new();
+ }
+ if (ret == NULL) {
+ goto end;
+ }
+
+ /* A = a mod p */
+ if (!BN_nnmod(A, a, p, ctx)) {
+ goto end;
+ }
+
+ /* now write |p| - 1 as 2^e*q where q is odd */
+ e = 1;
+ while (!BN_is_bit_set(p, e)) {
+ e++;
+ }
+ /* we'll set q later (if needed) */
+
+ if (e == 1) {
+ /* The easy case: (|p|-1)/2 is odd, so 2 has an inverse
+ * modulo (|p|-1)/2, and square roots can be computed
+ * directly by modular exponentiation.
+ * We have
+ * 2 * (|p|+1)/4 == 1 (mod (|p|-1)/2),
+ * so we can use exponent (|p|+1)/4, i.e. (|p|-3)/4 + 1.
+ */
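+    /* Worked example: p = 7 (so e == 1) and a = 2. The exponent is
+     * (7+1)/4 == 2, and 2^2 == 4 (mod 7); indeed 4^2 == 16 == 2 (mod 7). */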
+ if (!BN_rshift(q, p, 2)) {
+ goto end;
+ }
+ q->neg = 0;
+ if (!BN_add_word(q, 1) ||
+ !BN_mod_exp(ret, A, q, p, ctx)) {
+ goto end;
+ }
+ err = 0;
+ goto vrfy;
+ }
+
+ if (e == 2) {
+ /* |p| == 5 (mod 8)
+ *
+ * In this case 2 is always a non-square since
+ * Legendre(2,p) = (-1)^((p^2-1)/8) for any odd prime.
+ * So if a really is a square, then 2*a is a non-square.
+ * Thus for
+ * b := (2*a)^((|p|-5)/8),
+ * i := (2*a)*b^2
+ * we have
+ * i^2 = (2*a)^((1 + (|p|-5)/4)*2)
+ * = (2*a)^((p-1)/2)
+ * = -1;
+ * so if we set
+ * x := a*b*(i-1),
+ * then
+ * x^2 = a^2 * b^2 * (i^2 - 2*i + 1)
+ * = a^2 * b^2 * (-2*i)
+ * = a*(-i)*(2*a*b^2)
+ * = a*(-i)*i
+ * = a.
+ *
+ * (This is due to A.O.L. Atkin,
+     * <URL: http://listserv.nodak.edu/scripts/wa.exe?A2=ind9211&L=nmbrthry&O=T&P=562>,
+ * November 1992.)
+ */
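+    /* Worked example: p = 13 (p == 5 (mod 8)) and a = 3:
+     *   b = 6^((13-5)/8) = 6,
+     *   i = 6*6^2 = 216 == 8 (mod 13), with 8^2 == 64 == -1 (mod 13),
+     *   x = 3*6*(8-1) = 126 == 9 (mod 13), and 9^2 == 81 == 3 (mod 13). */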
+
+ /* t := 2*a */
+ if (!BN_mod_lshift1_quick(t, A, p)) {
+ goto end;
+ }
+
+ /* b := (2*a)^((|p|-5)/8) */
+ if (!BN_rshift(q, p, 3)) {
+ goto end;
+ }
+ q->neg = 0;
+ if (!BN_mod_exp(b, t, q, p, ctx)) {
+ goto end;
+ }
+
+ /* y := b^2 */
+ if (!BN_mod_sqr(y, b, p, ctx)) {
+ goto end;
+ }
+
+    /* t := (2*a)*b^2 - 1 */
+ if (!BN_mod_mul(t, t, y, p, ctx) ||
+ !BN_sub_word(t, 1)) {
+ goto end;
+ }
+
+ /* x = a*b*t */
+ if (!BN_mod_mul(x, A, b, p, ctx) ||
+ !BN_mod_mul(x, x, t, p, ctx)) {
+ goto end;
+ }
+
+ if (!BN_copy(ret, x)) {
+ goto end;
+ }
+ err = 0;
+ goto vrfy;
+ }
+
+ /* e > 2, so we really have to use the Tonelli/Shanks algorithm.
+ * First, find some y that is not a square. */
+ if (!BN_copy(q, p)) {
+ goto end; /* use 'q' as temp */
+ }
+ q->neg = 0;
+ i = 2;
+ do {
+ /* For efficiency, try small numbers first;
+ * if this fails, try random numbers.
+ */
+ if (i < 22) {
+ if (!BN_set_word(y, i)) {
+ goto end;
+ }
+ } else {
+ if (!BN_pseudo_rand(y, BN_num_bits(p), 0, 0)) {
+ goto end;
+ }
+ if (BN_ucmp(y, p) >= 0) {
+ if (!(p->neg ? BN_add : BN_sub)(y, y, p)) {
+ goto end;
+ }
+ }
+ /* now 0 <= y < |p| */
+ if (BN_is_zero(y)) {
+ if (!BN_set_word(y, i)) {
+ goto end;
+ }
+ }
+ }
+
+ r = BN_kronecker(y, q, ctx); /* here 'q' is |p| */
+ if (r < -1) {
+ goto end;
+ }
+ if (r == 0) {
+      /* gcd(y, |p|) != 1, so p cannot be prime */
+ OPENSSL_PUT_ERROR(BN, BN_mod_sqrt, BN_R_P_IS_NOT_PRIME);
+ goto end;
+ }
+ } while (r == 1 && ++i < 82);
+
+ if (r != -1) {
+ /* Many rounds and still no non-square -- this is more likely
+ * a bug than just bad luck.
+ * Even if p is not prime, we should have found some y
+ * such that r == -1.
+ */
+ OPENSSL_PUT_ERROR(BN, BN_mod_sqrt, BN_R_TOO_MANY_ITERATIONS);
+ goto end;
+ }
+
+ /* Here's our actual 'q': */
+ if (!BN_rshift(q, q, e)) {
+ goto end;
+ }
+
+ /* Now that we have some non-square, we can find an element
+ * of order 2^e by computing its q'th power. */
+ if (!BN_mod_exp(y, y, q, p, ctx)) {
+ goto end;
+ }
+ if (BN_is_one(y)) {
+ OPENSSL_PUT_ERROR(BN, BN_mod_sqrt, BN_R_P_IS_NOT_PRIME);
+ goto end;
+ }
+
+ /* Now we know that (if p is indeed prime) there is an integer
+ * k, 0 <= k < 2^e, such that
+ *
+ * a^q * y^k == 1 (mod p).
+ *
+ * As a^q is a square and y is not, k must be even.
+ * q+1 is even, too, so there is an element
+ *
+ * X := a^((q+1)/2) * y^(k/2),
+ *
+ * and it satisfies
+ *
+ * X^2 = a^q * a * y^k
+ * = a,
+ *
+ * so it is the square root that we are looking for.
+ */
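+
+  /* Worked example: p = 17, a = 2. Here |p| - 1 = 2^4 * 1, so e = 4 and
+   * q = 1. y = 3 is a non-square mod 17, so y := y^q = 3,
+   * x = a^((q+1)/2) = 2 and b = a^q = 2. The smallest i with
+   * b^(2^i) == 1 (mod 17) is i = 3, so t = y^(2^(4-3-1)) = 3, then y = 9,
+   * x = 6 and b = 2*9 == 1 (mod 17). The loop exits with ret = x = 6, and
+   * indeed 6^2 == 36 == 2 (mod 17). */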
+
+ /* t := (q-1)/2 (note that q is odd) */
+ if (!BN_rshift1(t, q)) {
+ goto end;
+ }
+
+ /* x := a^((q-1)/2) */
+  if (BN_is_zero(t)) {
+    /* special case: p = 2^e + 1 */
+ if (!BN_nnmod(t, A, p, ctx)) {
+ goto end;
+ }
+ if (BN_is_zero(t)) {
+ /* special case: a == 0 (mod p) */
+ BN_zero(ret);
+ err = 0;
+ goto end;
+ } else if (!BN_one(x)) {
+ goto end;
+ }
+ } else {
+ if (!BN_mod_exp(x, A, t, p, ctx)) {
+ goto end;
+ }
+ if (BN_is_zero(x)) {
+ /* special case: a == 0 (mod p) */
+ BN_zero(ret);
+ err = 0;
+ goto end;
+ }
+ }
+
+ /* b := a*x^2 (= a^q) */
+ if (!BN_mod_sqr(b, x, p, ctx) ||
+ !BN_mod_mul(b, b, A, p, ctx)) {
+ goto end;
+ }
+
+ /* x := a*x (= a^((q+1)/2)) */
+ if (!BN_mod_mul(x, x, A, p, ctx)) {
+ goto end;
+ }
+
+ while (1) {
+ /* Now b is a^q * y^k for some even k (0 <= k < 2^E
+ * where E refers to the original value of e, which we
+ * don't keep in a variable), and x is a^((q+1)/2) * y^(k/2).
+ *
+ * We have a*b = x^2,
+ * y^2^(e-1) = -1,
+ * b^2^(e-1) = 1.
+ */
+
+ if (BN_is_one(b)) {
+ if (!BN_copy(ret, x)) {
+ goto end;
+ }
+ err = 0;
+ goto vrfy;
+ }
+
+ /* find smallest i such that b^(2^i) = 1 */
+ i = 1;
+ if (!BN_mod_sqr(t, b, p, ctx)) {
+ goto end;
+ }
+ while (!BN_is_one(t)) {
+ i++;
+ if (i == e) {
+ OPENSSL_PUT_ERROR(BN, BN_mod_sqrt, BN_R_NOT_A_SQUARE);
+ goto end;
+ }
+ if (!BN_mod_mul(t, t, t, p, ctx)) {
+ goto end;
+ }
+ }
+
+ /* t := y^2^(e - i - 1) */
+ if (!BN_copy(t, y)) {
+ goto end;
+ }
+ for (j = e - i - 1; j > 0; j--) {
+ if (!BN_mod_sqr(t, t, p, ctx)) {
+ goto end;
+ }
+ }
+ if (!BN_mod_mul(y, t, t, p, ctx) ||
+ !BN_mod_mul(x, x, t, p, ctx) ||
+ !BN_mod_mul(b, b, y, p, ctx)) {
+ goto end;
+ }
+ e = i;
+ }
+
+vrfy:
+ if (!err) {
+    /* verify the result -- the input might not have been a square
+     * (test added in 0.9.8) */
+
+ if (!BN_mod_sqr(x, ret, p, ctx)) {
+ err = 1;
+ }
+
+ if (!err && 0 != BN_cmp(x, A)) {
+ OPENSSL_PUT_ERROR(BN, BN_mod_sqrt, BN_R_NOT_A_SQUARE);
+ err = 1;
+ }
+ }
+
+end:
+ if (err) {
+ if (ret != NULL && ret != in) {
+ BN_clear_free(ret);
+ }
+ ret = NULL;
+ }
+ BN_CTX_end(ctx);
+ return ret;
+}
+
+int BN_sqrt(BIGNUM *out_sqrt, const BIGNUM *in, BN_CTX *ctx) {
+ BIGNUM *estimate, *tmp, *delta, *last_delta, *tmp2;
+ int ok = 0, last_delta_valid = 0;
+
+ if (in->neg) {
+ OPENSSL_PUT_ERROR(BN, BN_sqrt, BN_R_NEGATIVE_NUMBER);
+ return 0;
+ }
+ if (BN_is_zero(in)) {
+ BN_zero(out_sqrt);
+ return 1;
+ }
+
+ BN_CTX_start(ctx);
+ if (out_sqrt == in) {
+ estimate = BN_CTX_get(ctx);
+ } else {
+ estimate = out_sqrt;
+ }
+ tmp = BN_CTX_get(ctx);
+ last_delta = BN_CTX_get(ctx);
+ delta = BN_CTX_get(ctx);
+ if (estimate == NULL || tmp == NULL || last_delta == NULL || delta == NULL) {
+ OPENSSL_PUT_ERROR(BN, BN_sqrt, ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+
+ /* We estimate that the square root of an n-bit number is 2^{n/2}. */
+  if (!BN_lshift(estimate, BN_value_one(), BN_num_bits(in) / 2)) {
+    OPENSSL_PUT_ERROR(BN, BN_sqrt, ERR_R_BN_LIB);
+    goto err;
+  }
+
+ /* This is Newton's method for finding a root of the equation |estimate|^2 -
+ * |in| = 0. */
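+  /* For example, with in = 144 (8 bits) the initial estimate is 2^4 = 16.
+   * One iteration gives (16 + 144/16)/2 = 12; the next leaves the estimate
+   * at 12, delta stops shrinking and the loop exits with 12^2 == 144. */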
+ for (;;) {
+ /* |estimate| = 1/2 * (|estimate| + |in|/|estimate|) */
+ if (!BN_div(tmp, NULL, in, estimate, ctx) ||
+ !BN_add(tmp, tmp, estimate) ||
+ !BN_rshift1(estimate, tmp) ||
+ /* |tmp| = |estimate|^2 */
+ !BN_sqr(tmp, estimate, ctx) ||
+ /* |delta| = |in| - |tmp| */
+ !BN_sub(delta, in, tmp)) {
+ OPENSSL_PUT_ERROR(BN, BN_sqrt, ERR_R_BN_LIB);
+ goto err;
+ }
+
+ delta->neg = 0;
+ /* The difference between |in| and |estimate| squared is required to always
+ * decrease. This ensures that the loop always terminates, but I don't have
+ * a proof that it always finds the square root for a given square. */
+ if (last_delta_valid && BN_cmp(delta, last_delta) >= 0) {
+ break;
+ }
+
+ last_delta_valid = 1;
+
+ tmp2 = last_delta;
+ last_delta = delta;
+ delta = tmp2;
+ }
+
+ if (BN_cmp(tmp, in) != 0) {
+ OPENSSL_PUT_ERROR(BN, BN_sqrt, BN_R_NOT_A_SQUARE);
+ goto err;
+ }
+
+ ok = 1;
+
+err:
+  if (ok && out_sqrt == in && !BN_copy(out_sqrt, estimate)) {
+    ok = 0;
+  }
+ BN_CTX_end(ctx);
+ return ok;
+}
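+
+/* Usage sketch (illustrative only):
+ *
+ *   BN_CTX *ctx = BN_CTX_new();
+ *   BIGNUM *n = BN_new(), *r = BN_new();
+ *   BN_set_word(n, 144);
+ *   if (BN_sqrt(r, n, ctx)) {
+ *     // r == 12
+ *   }
+ *   BN_free(r);
+ *   BN_free(n);
+ *   BN_CTX_free(ctx);
+ */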