diff options
Diffstat (limited to 'linux-aarch64')
-rw-r--r-- | linux-aarch64/crypto/aes/aesv8-armx.S | 520 | ||||
-rw-r--r-- | linux-aarch64/crypto/modes/ghashv8-armx.S | 251 | ||||
-rw-r--r-- | linux-aarch64/crypto/sha/sha1-armv8.S | 664 | ||||
-rw-r--r-- | linux-aarch64/crypto/sha/sha256-armv8.S | 292 | ||||
-rw-r--r-- | linux-aarch64/crypto/sha/sha512-armv8.S | 96 |
5 files changed, 982 insertions, 841 deletions
diff --git a/linux-aarch64/crypto/aes/aesv8-armx.S b/linux-aarch64/crypto/aes/aesv8-armx.S index e7ae46f..9c63291 100644 --- a/linux-aarch64/crypto/aes/aesv8-armx.S +++ b/linux-aarch64/crypto/aes/aesv8-armx.S @@ -6,7 +6,7 @@ .arch armv8-a+crypto #endif .align 5 -rcon: +.Lrcon: .long 0x01,0x01,0x01,0x01 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat .long 0x1b,0x1b,0x1b,0x1b @@ -31,7 +31,7 @@ aes_v8_set_encrypt_key: tst w1,#0x3f b.ne .Lenc_key_abort - adr x3,rcon + adr x3,.Lrcon cmp w1,#192 eor v0.16b,v0.16b,v0.16b @@ -55,7 +55,7 @@ aes_v8_set_encrypt_key: ext v5.16b,v0.16b,v5.16b,#12 eor v3.16b,v3.16b,v5.16b ext v5.16b,v0.16b,v5.16b,#12 - eor v6.16b,v6.16b,v1.16b + eor v6.16b,v6.16b,v1.16b eor v3.16b,v3.16b,v5.16b shl v1.16b,v1.16b,#1 eor v3.16b,v3.16b,v6.16b @@ -72,7 +72,7 @@ aes_v8_set_encrypt_key: ext v5.16b,v0.16b,v5.16b,#12 eor v3.16b,v3.16b,v5.16b ext v5.16b,v0.16b,v5.16b,#12 - eor v6.16b,v6.16b,v1.16b + eor v6.16b,v6.16b,v1.16b eor v3.16b,v3.16b,v5.16b shl v1.16b,v1.16b,#1 eor v3.16b,v3.16b,v6.16b @@ -86,7 +86,7 @@ aes_v8_set_encrypt_key: ext v5.16b,v0.16b,v5.16b,#12 eor v3.16b,v3.16b,v5.16b ext v5.16b,v0.16b,v5.16b,#12 - eor v6.16b,v6.16b,v1.16b + eor v6.16b,v6.16b,v1.16b eor v3.16b,v3.16b,v5.16b eor v3.16b,v3.16b,v6.16b st1 {v3.4s},[x2] @@ -117,7 +117,7 @@ aes_v8_set_encrypt_key: dup v5.4s,v3.s[3] eor v5.16b,v5.16b,v4.16b - eor v6.16b,v6.16b,v1.16b + eor v6.16b,v6.16b,v1.16b ext v4.16b,v0.16b,v4.16b,#12 shl v1.16b,v1.16b,#1 eor v4.16b,v4.16b,v5.16b @@ -148,7 +148,7 @@ aes_v8_set_encrypt_key: ext v5.16b,v0.16b,v5.16b,#12 eor v3.16b,v3.16b,v5.16b ext v5.16b,v0.16b,v5.16b,#12 - eor v6.16b,v6.16b,v1.16b + eor v6.16b,v6.16b,v1.16b eor v3.16b,v3.16b,v5.16b shl v1.16b,v1.16b,#1 eor v3.16b,v3.16b,v6.16b @@ -229,17 +229,17 @@ aes_v8_encrypt: .Loop_enc: aese v2.16b,v0.16b - ld1 {v0.4s},[x2],#16 aesmc v2.16b,v2.16b + ld1 {v0.4s},[x2],#16 subs w3,w3,#2 aese v2.16b,v1.16b - ld1 {v1.4s},[x2],#16 aesmc v2.16b,v2.16b + ld1 {v1.4s},[x2],#16 b.gt .Loop_enc aese v2.16b,v0.16b - ld1 {v0.4s},[x2] aesmc v2.16b,v2.16b + ld1 {v0.4s},[x2] aese v2.16b,v1.16b eor v2.16b,v2.16b,v0.16b @@ -258,17 +258,17 @@ aes_v8_decrypt: .Loop_dec: aesd v2.16b,v0.16b - ld1 {v0.4s},[x2],#16 aesimc v2.16b,v2.16b + ld1 {v0.4s},[x2],#16 subs w3,w3,#2 aesd v2.16b,v1.16b - ld1 {v1.4s},[x2],#16 aesimc v2.16b,v2.16b + ld1 {v1.4s},[x2],#16 b.gt .Loop_dec aesd v2.16b,v0.16b - ld1 {v0.4s},[x2] aesimc v2.16b,v2.16b + ld1 {v0.4s},[x2] aesd v2.16b,v1.16b eor v2.16b,v2.16b,v0.16b @@ -292,13 +292,13 @@ aes_v8_cbc_encrypt: ld1 {v6.16b},[x4] ld1 {v0.16b},[x0],x8 - ld1 {v16.4s-v17.4s},[x3] // load key schedule... + ld1 {v16.4s,v17.4s},[x3] // load key schedule... sub w5,w5,#6 add x7,x3,x5,lsl#4 // pointer to last 7 round keys sub w5,w5,#2 - ld1 {v18.4s-v19.4s},[x7],#32 - ld1 {v20.4s-v21.4s},[x7],#32 - ld1 {v22.4s-v23.4s},[x7],#32 + ld1 {v18.4s,v19.4s},[x7],#32 + ld1 {v20.4s,v21.4s},[x7],#32 + ld1 {v22.4s,v23.4s},[x7],#32 ld1 {v7.4s},[x7] add x7,x3,#32 @@ -310,76 +310,99 @@ aes_v8_cbc_encrypt: eor v5.16b,v16.16b,v7.16b b.eq .Lcbc_enc128 + ld1 {v2.4s,v3.4s},[x7] + add x7,x3,#16 + add x6,x3,#16*4 + add x12,x3,#16*5 + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + add x14,x3,#16*6 + add x3,x3,#16*7 + b .Lenter_cbc_enc + +.align 4 .Loop_cbc_enc: aese v0.16b,v16.16b - ld1 {v16.4s},[x7],#16 aesmc v0.16b,v0.16b - subs w6,w6,#2 + st1 {v6.16b},[x1],#16 +.Lenter_cbc_enc: aese v0.16b,v17.16b - ld1 {v17.4s},[x7],#16 aesmc v0.16b,v0.16b - b.gt .Loop_cbc_enc + aese v0.16b,v2.16b + aesmc v0.16b,v0.16b + ld1 {v16.4s},[x6] + cmp w5,#4 + aese v0.16b,v3.16b + aesmc v0.16b,v0.16b + ld1 {v17.4s},[x12] + b.eq .Lcbc_enc192 aese v0.16b,v16.16b aesmc v0.16b,v0.16b - subs x2,x2,#16 + ld1 {v16.4s},[x14] aese v0.16b,v17.16b aesmc v0.16b,v0.16b - csel x8,xzr,x8,eq + ld1 {v17.4s},[x3] + nop + +.Lcbc_enc192: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + subs x2,x2,#16 + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + csel x8,xzr,x8,eq aese v0.16b,v18.16b aesmc v0.16b,v0.16b - add x7,x3,#16 aese v0.16b,v19.16b aesmc v0.16b,v0.16b - ld1 {v16.16b},[x0],x8 + ld1 {v16.16b},[x0],x8 aese v0.16b,v20.16b aesmc v0.16b,v0.16b - eor v16.16b,v16.16b,v5.16b + eor v16.16b,v16.16b,v5.16b aese v0.16b,v21.16b aesmc v0.16b,v0.16b - ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] + ld1 {v17.4s},[x7] // re-pre-load rndkey[1] aese v0.16b,v22.16b aesmc v0.16b,v0.16b aese v0.16b,v23.16b - - mov w6,w5 eor v6.16b,v0.16b,v7.16b - st1 {v6.16b},[x1],#16 b.hs .Loop_cbc_enc + st1 {v6.16b},[x1],#16 b .Lcbc_done .align 5 .Lcbc_enc128: - ld1 {v2.4s-v3.4s},[x7] + ld1 {v2.4s,v3.4s},[x7] aese v0.16b,v16.16b aesmc v0.16b,v0.16b b .Lenter_cbc_enc128 .Loop_cbc_enc128: aese v0.16b,v16.16b aesmc v0.16b,v0.16b - st1 {v6.16b},[x1],#16 + st1 {v6.16b},[x1],#16 .Lenter_cbc_enc128: aese v0.16b,v17.16b aesmc v0.16b,v0.16b - subs x2,x2,#16 + subs x2,x2,#16 aese v0.16b,v2.16b aesmc v0.16b,v0.16b - csel x8,xzr,x8,eq + csel x8,xzr,x8,eq aese v0.16b,v3.16b aesmc v0.16b,v0.16b aese v0.16b,v18.16b aesmc v0.16b,v0.16b aese v0.16b,v19.16b aesmc v0.16b,v0.16b - ld1 {v16.16b},[x0],x8 + ld1 {v16.16b},[x0],x8 aese v0.16b,v20.16b aesmc v0.16b,v0.16b aese v0.16b,v21.16b aesmc v0.16b,v0.16b aese v0.16b,v22.16b aesmc v0.16b,v0.16b - eor v16.16b,v16.16b,v5.16b + eor v16.16b,v16.16b,v5.16b aese v0.16b,v23.16b eor v6.16b,v0.16b,v7.16b b.hs .Loop_cbc_enc128 @@ -404,81 +427,80 @@ aes_v8_cbc_encrypt: .Loop3x_cbc_dec: aesd v0.16b,v16.16b - aesd v1.16b,v16.16b - aesd v18.16b,v16.16b - ld1 {v16.4s},[x7],#16 aesimc v0.16b,v0.16b + aesd v1.16b,v16.16b aesimc v1.16b,v1.16b + aesd v18.16b,v16.16b aesimc v18.16b,v18.16b + ld1 {v16.4s},[x7],#16 subs w6,w6,#2 aesd v0.16b,v17.16b - aesd v1.16b,v17.16b - aesd v18.16b,v17.16b - ld1 {v17.4s},[x7],#16 aesimc v0.16b,v0.16b + aesd v1.16b,v17.16b aesimc v1.16b,v1.16b + aesd v18.16b,v17.16b aesimc v18.16b,v18.16b + ld1 {v17.4s},[x7],#16 b.gt .Loop3x_cbc_dec aesd v0.16b,v16.16b - aesd v1.16b,v16.16b - aesd v18.16b,v16.16b - eor v4.16b,v6.16b,v7.16b aesimc v0.16b,v0.16b + aesd v1.16b,v16.16b aesimc v1.16b,v1.16b + aesd v18.16b,v16.16b aesimc v18.16b,v18.16b - eor v5.16b,v2.16b,v7.16b + eor v4.16b,v6.16b,v7.16b + subs x2,x2,#0x30 + eor v5.16b,v2.16b,v7.16b + csel x6,x2,x6,lo // x6, w6, is zero at this point aesd v0.16b,v17.16b - aesd v1.16b,v17.16b - aesd v18.16b,v17.16b - eor v17.16b,v3.16b,v7.16b - subs x2,x2,#0x30 aesimc v0.16b,v0.16b + aesd v1.16b,v17.16b aesimc v1.16b,v1.16b + aesd v18.16b,v17.16b aesimc v18.16b,v18.16b - orr v6.16b,v19.16b,v19.16b - csel x6,x2,x6,lo // x6, w6, is zero at this point - aesd v0.16b,v20.16b - aesd v1.16b,v20.16b - aesd v18.16b,v20.16b - add x0,x0,x6 // x0 is adjusted in such way that + eor v17.16b,v3.16b,v7.16b + add x0,x0,x6 // x0 is adjusted in such way that // at exit from the loop v1.16b-v18.16b // are loaded with last "words" + orr v6.16b,v19.16b,v19.16b + mov x7,x3 + aesd v0.16b,v20.16b aesimc v0.16b,v0.16b + aesd v1.16b,v20.16b aesimc v1.16b,v1.16b + aesd v18.16b,v20.16b aesimc v18.16b,v18.16b - mov x7,x3 + ld1 {v2.16b},[x0],#16 aesd v0.16b,v21.16b - aesd v1.16b,v21.16b - aesd v18.16b,v21.16b - ld1 {v2.16b},[x0],#16 aesimc v0.16b,v0.16b + aesd v1.16b,v21.16b aesimc v1.16b,v1.16b + aesd v18.16b,v21.16b aesimc v18.16b,v18.16b - ld1 {v3.16b},[x0],#16 + ld1 {v3.16b},[x0],#16 aesd v0.16b,v22.16b - aesd v1.16b,v22.16b - aesd v18.16b,v22.16b - ld1 {v19.16b},[x0],#16 aesimc v0.16b,v0.16b + aesd v1.16b,v22.16b aesimc v1.16b,v1.16b + aesd v18.16b,v22.16b aesimc v18.16b,v18.16b - ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + ld1 {v19.16b},[x0],#16 aesd v0.16b,v23.16b aesd v1.16b,v23.16b aesd v18.16b,v23.16b - - add w6,w5,#2 + ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + add w6,w5,#2 eor v4.16b,v4.16b,v0.16b eor v5.16b,v5.16b,v1.16b eor v18.16b,v18.16b,v17.16b - ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] - orr v0.16b,v2.16b,v2.16b + ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] st1 {v4.16b},[x1],#16 - orr v1.16b,v3.16b,v3.16b + orr v0.16b,v2.16b,v2.16b st1 {v5.16b},[x1],#16 + orr v1.16b,v3.16b,v3.16b st1 {v18.16b},[x1],#16 - orr v18.16b,v19.16b,v19.16b + orr v18.16b,v19.16b,v19.16b b.hs .Loop3x_cbc_dec cmn x2,#0x30 @@ -487,54 +509,54 @@ aes_v8_cbc_encrypt: .Lcbc_dec_tail: aesd v1.16b,v16.16b - aesd v18.16b,v16.16b - ld1 {v16.4s},[x7],#16 aesimc v1.16b,v1.16b + aesd v18.16b,v16.16b aesimc v18.16b,v18.16b + ld1 {v16.4s},[x7],#16 subs w6,w6,#2 aesd v1.16b,v17.16b - aesd v18.16b,v17.16b - ld1 {v17.4s},[x7],#16 aesimc v1.16b,v1.16b + aesd v18.16b,v17.16b aesimc v18.16b,v18.16b + ld1 {v17.4s},[x7],#16 b.gt .Lcbc_dec_tail aesd v1.16b,v16.16b - aesd v18.16b,v16.16b aesimc v1.16b,v1.16b + aesd v18.16b,v16.16b aesimc v18.16b,v18.16b aesd v1.16b,v17.16b - aesd v18.16b,v17.16b aesimc v1.16b,v1.16b + aesd v18.16b,v17.16b aesimc v18.16b,v18.16b aesd v1.16b,v20.16b - aesd v18.16b,v20.16b aesimc v1.16b,v1.16b + aesd v18.16b,v20.16b aesimc v18.16b,v18.16b - cmn x2,#0x20 + cmn x2,#0x20 aesd v1.16b,v21.16b - aesd v18.16b,v21.16b aesimc v1.16b,v1.16b + aesd v18.16b,v21.16b aesimc v18.16b,v18.16b - eor v5.16b,v6.16b,v7.16b + eor v5.16b,v6.16b,v7.16b aesd v1.16b,v22.16b - aesd v18.16b,v22.16b aesimc v1.16b,v1.16b + aesd v18.16b,v22.16b aesimc v18.16b,v18.16b - eor v17.16b,v3.16b,v7.16b + eor v17.16b,v3.16b,v7.16b aesd v1.16b,v23.16b aesd v18.16b,v23.16b b.eq .Lcbc_dec_one eor v5.16b,v5.16b,v1.16b eor v17.16b,v17.16b,v18.16b - orr v6.16b,v19.16b,v19.16b + orr v6.16b,v19.16b,v19.16b st1 {v5.16b},[x1],#16 st1 {v17.16b},[x1],#16 b .Lcbc_done .Lcbc_dec_one: eor v5.16b,v5.16b,v18.16b - orr v6.16b,v19.16b,v19.16b + orr v6.16b,v19.16b,v19.16b st1 {v5.16b},[x1],#16 .Lcbc_done: @@ -547,181 +569,181 @@ aes_v8_cbc_encrypt: .type aes_v8_ctr32_encrypt_blocks,%function .align 5 aes_v8_ctr32_encrypt_blocks: - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - ldr w5,[x3,#240] - - ldr w8, [x4, #12] - ld1 {v0.4s},[x4] - - ld1 {v16.4s-v17.4s},[x3] // load key schedule... - sub w5,w5,#4 - mov x12,#16 - cmp x2,#2 - add x7,x3,x5,lsl#4 // pointer to last 5 round keys - sub w5,w5,#2 - ld1 {v20.4s-v21.4s},[x7],#32 - ld1 {v22.4s-v23.4s},[x7],#32 - ld1 {v7.4s},[x7] - add x7,x3,#32 - mov w6,w5 + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + ldr w5,[x3,#240] + + ldr w8, [x4, #12] + ld1 {v0.4s},[x4] + + ld1 {v16.4s,v17.4s},[x3] // load key schedule... + sub w5,w5,#4 + mov x12,#16 + cmp x2,#2 + add x7,x3,x5,lsl#4 // pointer to last 5 round keys + sub w5,w5,#2 + ld1 {v20.4s,v21.4s},[x7],#32 + ld1 {v22.4s,v23.4s},[x7],#32 + ld1 {v7.4s},[x7] + add x7,x3,#32 + mov w6,w5 csel x12,xzr,x12,lo #ifndef __ARMEB__ - rev w8, w8 + rev w8, w8 #endif - orr v1.16b,v0.16b,v0.16b - add w10, w8, #1 - orr v18.16b,v0.16b,v0.16b - add w8, w8, #2 - orr v6.16b,v0.16b,v0.16b - rev w10, w10 - mov v1.s[3],w10 - b.ls .Lctr32_tail - rev w12, w8 - sub x2,x2,#3 // bias - mov v18.s[3],w12 - b .Loop3x_ctr32 + orr v1.16b,v0.16b,v0.16b + add w10, w8, #1 + orr v18.16b,v0.16b,v0.16b + add w8, w8, #2 + orr v6.16b,v0.16b,v0.16b + rev w10, w10 + mov v1.s[3],w10 + b.ls .Lctr32_tail + rev w12, w8 + sub x2,x2,#3 // bias + mov v18.s[3],w12 + b .Loop3x_ctr32 .align 4 .Loop3x_ctr32: - aese v0.16b,v16.16b - aese v1.16b,v16.16b - aese v18.16b,v16.16b - ld1 {v16.4s},[x7],#16 - aesmc v0.16b,v0.16b - aesmc v1.16b,v1.16b - aesmc v18.16b,v18.16b - subs w6,w6,#2 - aese v0.16b,v17.16b - aese v1.16b,v17.16b - aese v18.16b,v17.16b - ld1 {v17.4s},[x7],#16 - aesmc v0.16b,v0.16b - aesmc v1.16b,v1.16b - aesmc v18.16b,v18.16b - b.gt .Loop3x_ctr32 - - aese v0.16b,v16.16b - aese v1.16b,v16.16b - aese v18.16b,v16.16b - mov x7,x3 - aesmc v4.16b,v0.16b - ld1 {v2.16b},[x0],#16 - aesmc v5.16b,v1.16b - aesmc v18.16b,v18.16b - orr v0.16b,v6.16b,v6.16b - aese v4.16b,v17.16b - ld1 {v3.16b},[x0],#16 - aese v5.16b,v17.16b - aese v18.16b,v17.16b - orr v1.16b,v6.16b,v6.16b - aesmc v4.16b,v4.16b - ld1 {v19.16b},[x0],#16 - aesmc v5.16b,v5.16b - aesmc v17.16b,v18.16b - orr v18.16b,v6.16b,v6.16b - add w9,w8,#1 - aese v4.16b,v20.16b - aese v5.16b,v20.16b - aese v17.16b,v20.16b - eor v2.16b,v2.16b,v7.16b - add w10,w8,#2 - aesmc v4.16b,v4.16b - aesmc v5.16b,v5.16b - aesmc v17.16b,v17.16b - eor v3.16b,v3.16b,v7.16b - add w8,w8,#3 - aese v4.16b,v21.16b - aese v5.16b,v21.16b - aese v17.16b,v21.16b - eor v19.16b,v19.16b,v7.16b - rev w9,w9 - aesmc v4.16b,v4.16b - ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] - aesmc v5.16b,v5.16b - aesmc v17.16b,v17.16b - mov v0.s[3], w9 - rev w10,w10 - aese v4.16b,v22.16b - aese v5.16b,v22.16b - aese v17.16b,v22.16b - mov v1.s[3], w10 - rev w12,w8 - aesmc v4.16b,v4.16b - aesmc v5.16b,v5.16b - aesmc v17.16b,v17.16b - mov v18.s[3], w12 - subs x2,x2,#3 - aese v4.16b,v23.16b - aese v5.16b,v23.16b - aese v17.16b,v23.16b - - mov w6,w5 - eor v2.16b,v2.16b,v4.16b - eor v3.16b,v3.16b,v5.16b - eor v19.16b,v19.16b,v17.16b - ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] - st1 {v2.16b},[x1],#16 - st1 {v3.16b},[x1],#16 - st1 {v19.16b},[x1],#16 - b.hs .Loop3x_ctr32 - - adds x2,x2,#3 - b.eq .Lctr32_done - cmp x2,#1 - mov x12,#16 + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + aese v18.16b,v16.16b + aesmc v18.16b,v18.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + aese v18.16b,v17.16b + aesmc v18.16b,v18.16b + ld1 {v17.4s},[x7],#16 + b.gt .Loop3x_ctr32 + + aese v0.16b,v16.16b + aesmc v4.16b,v0.16b + aese v1.16b,v16.16b + aesmc v5.16b,v1.16b + ld1 {v2.16b},[x0],#16 + orr v0.16b,v6.16b,v6.16b + aese v18.16b,v16.16b + aesmc v18.16b,v18.16b + ld1 {v3.16b},[x0],#16 + orr v1.16b,v6.16b,v6.16b + aese v4.16b,v17.16b + aesmc v4.16b,v4.16b + aese v5.16b,v17.16b + aesmc v5.16b,v5.16b + ld1 {v19.16b},[x0],#16 + mov x7,x3 + aese v18.16b,v17.16b + aesmc v17.16b,v18.16b + orr v18.16b,v6.16b,v6.16b + add w9,w8,#1 + aese v4.16b,v20.16b + aesmc v4.16b,v4.16b + aese v5.16b,v20.16b + aesmc v5.16b,v5.16b + eor v2.16b,v2.16b,v7.16b + add w10,w8,#2 + aese v17.16b,v20.16b + aesmc v17.16b,v17.16b + eor v3.16b,v3.16b,v7.16b + add w8,w8,#3 + aese v4.16b,v21.16b + aesmc v4.16b,v4.16b + aese v5.16b,v21.16b + aesmc v5.16b,v5.16b + eor v19.16b,v19.16b,v7.16b + rev w9,w9 + aese v17.16b,v21.16b + aesmc v17.16b,v17.16b + mov v0.s[3], w9 + rev w10,w10 + aese v4.16b,v22.16b + aesmc v4.16b,v4.16b + aese v5.16b,v22.16b + aesmc v5.16b,v5.16b + mov v1.s[3], w10 + rev w12,w8 + aese v17.16b,v22.16b + aesmc v17.16b,v17.16b + mov v18.s[3], w12 + subs x2,x2,#3 + aese v4.16b,v23.16b + aese v5.16b,v23.16b + aese v17.16b,v23.16b + + eor v2.16b,v2.16b,v4.16b + ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + st1 {v2.16b},[x1],#16 + eor v3.16b,v3.16b,v5.16b + mov w6,w5 + st1 {v3.16b},[x1],#16 + eor v19.16b,v19.16b,v17.16b + ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] + st1 {v19.16b},[x1],#16 + b.hs .Loop3x_ctr32 + + adds x2,x2,#3 + b.eq .Lctr32_done + cmp x2,#1 + mov x12,#16 csel x12,xzr,x12,eq .Lctr32_tail: - aese v0.16b,v16.16b - aese v1.16b,v16.16b - ld1 {v16.4s},[x7],#16 - aesmc v0.16b,v0.16b - aesmc v1.16b,v1.16b - subs w6,w6,#2 - aese v0.16b,v17.16b - aese v1.16b,v17.16b - ld1 {v17.4s},[x7],#16 - aesmc v0.16b,v0.16b - aesmc v1.16b,v1.16b - b.gt .Lctr32_tail - - aese v0.16b,v16.16b - aese v1.16b,v16.16b - aesmc v0.16b,v0.16b - aesmc v1.16b,v1.16b - aese v0.16b,v17.16b - aese v1.16b,v17.16b - aesmc v0.16b,v0.16b - aesmc v1.16b,v1.16b - ld1 {v2.16b},[x0],x12 - aese v0.16b,v20.16b - aese v1.16b,v20.16b - ld1 {v3.16b},[x0] - aesmc v0.16b,v0.16b - aesmc v1.16b,v1.16b - aese v0.16b,v21.16b - aese v1.16b,v21.16b - aesmc v0.16b,v0.16b - aesmc v1.16b,v1.16b - aese v0.16b,v22.16b - aese v1.16b,v22.16b - eor v2.16b,v2.16b,v7.16b - aesmc v0.16b,v0.16b - aesmc v1.16b,v1.16b - eor v3.16b,v3.16b,v7.16b - aese v0.16b,v23.16b - aese v1.16b,v23.16b - - cmp x2,#1 - eor v2.16b,v2.16b,v0.16b - eor v3.16b,v3.16b,v1.16b - st1 {v2.16b},[x1],#16 - b.eq .Lctr32_done - st1 {v3.16b},[x1] + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + ld1 {v17.4s},[x7],#16 + b.gt .Lctr32_tail + + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + ld1 {v2.16b},[x0],x12 + aese v0.16b,v20.16b + aesmc v0.16b,v0.16b + aese v1.16b,v20.16b + aesmc v1.16b,v1.16b + ld1 {v3.16b},[x0] + aese v0.16b,v21.16b + aesmc v0.16b,v0.16b + aese v1.16b,v21.16b + aesmc v1.16b,v1.16b + eor v2.16b,v2.16b,v7.16b + aese v0.16b,v22.16b + aesmc v0.16b,v0.16b + aese v1.16b,v22.16b + aesmc v1.16b,v1.16b + eor v3.16b,v3.16b,v7.16b + aese v0.16b,v23.16b + aese v1.16b,v23.16b + + cmp x2,#1 + eor v2.16b,v2.16b,v0.16b + eor v3.16b,v3.16b,v1.16b + st1 {v2.16b},[x1],#16 + b.eq .Lctr32_done + st1 {v3.16b},[x1] .Lctr32_done: - ldr x29,[sp],#16 + ldr x29,[sp],#16 ret .size aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks #endif diff --git a/linux-aarch64/crypto/modes/ghashv8-armx.S b/linux-aarch64/crypto/modes/ghashv8-armx.S index 565146e..ad19074 100644 --- a/linux-aarch64/crypto/modes/ghashv8-armx.S +++ b/linux-aarch64/crypto/modes/ghashv8-armx.S @@ -4,114 +4,227 @@ #if !defined(__clang__) .arch armv8-a+crypto #endif -.global gcm_init_v8 +.globl gcm_init_v8 .type gcm_init_v8,%function .align 4 gcm_init_v8: - ld1 {v17.2d},[x1] //load H - movi v16.16b,#0xe1 - ext v3.16b,v17.16b,v17.16b,#8 - shl v16.2d,v16.2d,#57 - ushr v18.2d,v16.2d,#63 - ext v16.16b,v18.16b,v16.16b,#8 //t0=0xc2....01 - dup v17.4s,v17.s[1] - ushr v19.2d,v3.2d,#63 + ld1 {v17.2d},[x1] //load input H + movi v19.16b,#0xe1 + shl v19.2d,v19.2d,#57 //0xc2.0 + ext v3.16b,v17.16b,v17.16b,#8 + ushr v18.2d,v19.2d,#63 + dup v17.4s,v17.s[1] + ext v16.16b,v18.16b,v19.16b,#8 //t0=0xc2....01 + ushr v18.2d,v3.2d,#63 sshr v17.4s,v17.4s,#31 //broadcast carry bit - and v19.16b,v19.16b,v16.16b + and v18.16b,v18.16b,v16.16b shl v3.2d,v3.2d,#1 - ext v19.16b,v19.16b,v19.16b,#8 - and v16.16b,v16.16b,v17.16b - orr v3.16b,v3.16b,v19.16b //H<<<=1 - eor v3.16b,v3.16b,v16.16b //twisted H - st1 {v3.2d},[x0] + ext v18.16b,v18.16b,v18.16b,#8 + and v16.16b,v16.16b,v17.16b + orr v3.16b,v3.16b,v18.16b //H<<<=1 + eor v20.16b,v3.16b,v16.16b //twisted H + st1 {v20.2d},[x0],#16 //store Htable[0] + + //calculate H^2 + ext v16.16b,v20.16b,v20.16b,#8 //Karatsuba pre-processing + pmull v0.1q,v20.1d,v20.1d + eor v16.16b,v16.16b,v20.16b + pmull2 v2.1q,v20.2d,v20.2d + pmull v1.1q,v16.1d,v16.1d + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase + + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v22.16b,v0.16b,v18.16b + + ext v17.16b,v22.16b,v22.16b,#8 //Karatsuba pre-processing + eor v17.16b,v17.16b,v22.16b + ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed + st1 {v21.2d,v22.2d},[x0] //store Htable[1..2] ret .size gcm_init_v8,.-gcm_init_v8 - -.global gcm_gmult_v8 +.globl gcm_gmult_v8 .type gcm_gmult_v8,%function .align 4 gcm_gmult_v8: - ld1 {v17.2d},[x0] //load Xi - movi v19.16b,#0xe1 - ld1 {v20.2d},[x1] //load twisted H + ld1 {v17.2d},[x0] //load Xi + movi v19.16b,#0xe1 + ld1 {v20.2d,v21.2d},[x1] //load twisted H, ... shl v19.2d,v19.2d,#57 #ifndef __ARMEB__ rev64 v17.16b,v17.16b #endif - ext v21.16b,v20.16b,v20.16b,#8 - mov x3,#0 - ext v3.16b,v17.16b,v17.16b,#8 - mov x12,#0 - eor v21.16b,v21.16b,v20.16b //Karatsuba pre-processing - mov x2,x0 - b .Lgmult_v8 -.size gcm_gmult_v8,.-gcm_gmult_v8 + ext v3.16b,v17.16b,v17.16b,#8 + + pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo + eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing + pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi + pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi) + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v0.16b,v0.16b,v18.16b -.global gcm_ghash_v8 +#ifndef __ARMEB__ + rev64 v0.16b,v0.16b +#endif + ext v0.16b,v0.16b,v0.16b,#8 + st1 {v0.2d},[x0] //write out Xi + + ret +.size gcm_gmult_v8,.-gcm_gmult_v8 +.globl gcm_ghash_v8 .type gcm_ghash_v8,%function .align 4 gcm_ghash_v8: - ld1 {v0.2d},[x0] //load [rotated] Xi - subs x3,x3,#16 - movi v19.16b,#0xe1 - mov x12,#16 - ld1 {v20.2d},[x1] //load twisted H - csel x12,xzr,x12,eq - ext v0.16b,v0.16b,v0.16b,#8 - shl v19.2d,v19.2d,#57 - ld1 {v17.2d},[x2],x12 //load [rotated] inp - ext v21.16b,v20.16b,v20.16b,#8 + ld1 {v0.2d},[x0] //load [rotated] Xi + //"[rotated]" means that + //loaded value would have + //to be rotated in order to + //make it appear as in + //alorithm specification + subs x3,x3,#32 //see if x3 is 32 or larger + mov x12,#16 //x12 is used as post- + //increment for input pointer; + //as loop is modulo-scheduled + //x12 is zeroed just in time + //to preclude oversteping + //inp[len], which means that + //last block[s] are actually + //loaded twice, but last + //copy is not processed + ld1 {v20.2d,v21.2d},[x1],#32 //load twisted H, ..., H^2 + movi v19.16b,#0xe1 + ld1 {v22.2d},[x1] + csel x12,xzr,x12,eq //is it time to zero x12? + ext v0.16b,v0.16b,v0.16b,#8 //rotate Xi + ld1 {v16.2d},[x2],#16 //load [rotated] I[0] + shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant #ifndef __ARMEB__ + rev64 v16.16b,v16.16b rev64 v0.16b,v0.16b +#endif + ext v3.16b,v16.16b,v16.16b,#8 //rotate I[0] + b.lo .Lodd_tail_v8 //x3 was less than 32 + ld1 {v17.2d},[x2],x12 //load [rotated] I[1] +#ifndef __ARMEB__ rev64 v17.16b,v17.16b #endif - eor v21.16b,v21.16b,v20.16b //Karatsuba pre-processing - ext v3.16b,v17.16b,v17.16b,#8 - b .Loop_v8 + ext v7.16b,v17.16b,v17.16b,#8 + eor v3.16b,v3.16b,v0.16b //I[i]^=Xi + pmull v4.1q,v20.1d,v7.1d //H·Ii+1 + eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing + pmull2 v6.1q,v20.2d,v7.2d + b .Loop_mod2x_v8 .align 4 -.Loop_v8: - ext v18.16b,v0.16b,v0.16b,#8 - eor v3.16b,v3.16b,v0.16b //inp^=Xi - eor v17.16b,v17.16b,v18.16b //v17.16b is rotated inp^Xi +.Loop_mod2x_v8: + ext v18.16b,v3.16b,v3.16b,#8 + subs x3,x3,#32 //is there more data? + pmull v0.1q,v22.1d,v3.1d //H^2.lo·Xi.lo + csel x12,xzr,x12,lo //is it time to zero x12? + + pmull v5.1q,v21.1d,v17.1d + eor v18.16b,v18.16b,v3.16b //Karatsuba pre-processing + pmull2 v2.1q,v22.2d,v3.2d //H^2.hi·Xi.hi + eor v0.16b,v0.16b,v4.16b //accumulate + pmull2 v1.1q,v21.2d,v18.2d //(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi) + ld1 {v16.2d},[x2],x12 //load [rotated] I[i+2] + + eor v2.16b,v2.16b,v6.16b + csel x12,xzr,x12,eq //is it time to zero x12? + eor v1.16b,v1.16b,v5.16b + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + ld1 {v17.2d},[x2],x12 //load [rotated] I[i+3] +#ifndef __ARMEB__ + rev64 v16.16b,v16.16b +#endif + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + +#ifndef __ARMEB__ + rev64 v17.16b,v17.16b +#endif + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + ext v7.16b,v17.16b,v17.16b,#8 + ext v3.16b,v16.16b,v16.16b,#8 + eor v0.16b,v1.16b,v18.16b + pmull v4.1q,v20.1d,v7.1d //H·Ii+1 + eor v3.16b,v3.16b,v2.16b //accumulate v3.16b early + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v3.16b,v3.16b,v18.16b + eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing + eor v3.16b,v3.16b,v0.16b + pmull2 v6.1q,v20.2d,v7.2d + b.hs .Loop_mod2x_v8 //there was at least 32 more bytes + + eor v2.16b,v2.16b,v18.16b + ext v3.16b,v16.16b,v16.16b,#8 //re-construct v3.16b + adds x3,x3,#32 //re-construct x3 + eor v0.16b,v0.16b,v2.16b //re-construct v0.16b + b.eq .Ldone_v8 //is x3 zero? +.Lodd_tail_v8: + ext v18.16b,v0.16b,v0.16b,#8 + eor v3.16b,v3.16b,v0.16b //inp^=Xi + eor v17.16b,v16.16b,v18.16b //v17.16b is rotated inp^Xi -.Lgmult_v8: pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo - eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing + eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi - subs x3,x3,#16 pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi) - csel x12,xzr,x12,eq - ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing - eor v18.16b,v0.16b,v2.16b - eor v1.16b,v1.16b,v17.16b - ld1 {v17.2d},[x2],x12 //load [rotated] inp - eor v1.16b,v1.16b,v18.16b - pmull v18.1q,v0.1d,v19.1d //1st phase + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction ins v2.d[0],v1.d[1] ins v1.d[1],v0.d[0] -#ifndef __ARMEB__ - rev64 v17.16b,v17.16b -#endif - eor v0.16b,v1.16b,v18.16b - ext v3.16b,v17.16b,v17.16b,#8 + eor v0.16b,v1.16b,v18.16b - ext v18.16b,v0.16b,v0.16b,#8 //2nd phase + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction pmull v0.1q,v0.1d,v19.1d - eor v18.16b,v18.16b,v2.16b - eor v0.16b,v0.16b,v18.16b - b.hs .Loop_v8 + eor v18.16b,v18.16b,v2.16b + eor v0.16b,v0.16b,v18.16b +.Ldone_v8: #ifndef __ARMEB__ rev64 v0.16b,v0.16b #endif - ext v0.16b,v0.16b,v0.16b,#8 - st1 {v0.2d},[x0] //write out Xi + ext v0.16b,v0.16b,v0.16b,#8 + st1 {v0.2d},[x0] //write out Xi ret .size gcm_ghash_v8,.-gcm_ghash_v8 -.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>" -.align 2 +.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 diff --git a/linux-aarch64/crypto/sha/sha1-armv8.S b/linux-aarch64/crypto/sha/sha1-armv8.S index f9d1262..ab6aa98 100644 --- a/linux-aarch64/crypto/sha/sha1-armv8.S +++ b/linux-aarch64/crypto/sha/sha1-armv8.S @@ -2,6 +2,7 @@ .text + .globl sha1_block_data_order .type sha1_block_data_order,%function .align 6 @@ -213,826 +214,826 @@ sha1_block_data_order: add w20,w20,w17 // future e+=X[i] add w21,w21,w25 // e+=F(b,c,d) lsr x19,x17,#32 - eor w3,w3,w5 + eor w3,w3,w5 bic w25,w24,w22 and w26,w23,w22 ror w27,w21,#27 - eor w3,w3,w11 + eor w3,w3,w11 add w24,w24,w28 // future e+=K orr w25,w25,w26 add w20,w20,w27 // e+=rot(a,5) - eor w3,w3,w16 + eor w3,w3,w16 ror w22,w22,#2 add w24,w24,w19 // future e+=X[i] add w20,w20,w25 // e+=F(b,c,d) - ror w3,w3,#31 - eor w4,w4,w6 + ror w3,w3,#31 + eor w4,w4,w6 bic w25,w23,w21 and w26,w22,w21 ror w27,w20,#27 - eor w4,w4,w12 + eor w4,w4,w12 add w23,w23,w28 // future e+=K orr w25,w25,w26 add w24,w24,w27 // e+=rot(a,5) - eor w4,w4,w17 + eor w4,w4,w17 ror w21,w21,#2 add w23,w23,w3 // future e+=X[i] add w24,w24,w25 // e+=F(b,c,d) - ror w4,w4,#31 - eor w5,w5,w7 + ror w4,w4,#31 + eor w5,w5,w7 bic w25,w22,w20 and w26,w21,w20 ror w27,w24,#27 - eor w5,w5,w13 + eor w5,w5,w13 add w22,w22,w28 // future e+=K orr w25,w25,w26 add w23,w23,w27 // e+=rot(a,5) - eor w5,w5,w19 + eor w5,w5,w19 ror w20,w20,#2 add w22,w22,w4 // future e+=X[i] add w23,w23,w25 // e+=F(b,c,d) - ror w5,w5,#31 - eor w6,w6,w8 + ror w5,w5,#31 + eor w6,w6,w8 bic w25,w21,w24 and w26,w20,w24 ror w27,w23,#27 - eor w6,w6,w14 + eor w6,w6,w14 add w21,w21,w28 // future e+=K orr w25,w25,w26 add w22,w22,w27 // e+=rot(a,5) - eor w6,w6,w3 + eor w6,w6,w3 ror w24,w24,#2 add w21,w21,w5 // future e+=X[i] add w22,w22,w25 // e+=F(b,c,d) - ror w6,w6,#31 - eor w7,w7,w9 + ror w6,w6,#31 + eor w7,w7,w9 bic w25,w20,w23 and w26,w24,w23 ror w27,w22,#27 - eor w7,w7,w15 + eor w7,w7,w15 add w20,w20,w28 // future e+=K orr w25,w25,w26 add w21,w21,w27 // e+=rot(a,5) - eor w7,w7,w4 + eor w7,w7,w4 ror w23,w23,#2 add w20,w20,w6 // future e+=X[i] add w21,w21,w25 // e+=F(b,c,d) - ror w7,w7,#31 + ror w7,w7,#31 movz w28,#0xeba1 movk w28,#0x6ed9,lsl#16 - eor w8,w8,w10 + eor w8,w8,w10 bic w25,w24,w22 and w26,w23,w22 ror w27,w21,#27 - eor w8,w8,w16 + eor w8,w8,w16 add w24,w24,w28 // future e+=K orr w25,w25,w26 add w20,w20,w27 // e+=rot(a,5) - eor w8,w8,w5 + eor w8,w8,w5 ror w22,w22,#2 add w24,w24,w7 // future e+=X[i] add w20,w20,w25 // e+=F(b,c,d) - ror w8,w8,#31 - eor w9,w9,w11 + ror w8,w8,#31 + eor w9,w9,w11 eor w25,w23,w21 ror w27,w20,#27 add w23,w23,w28 // future e+=K - eor w9,w9,w17 + eor w9,w9,w17 eor w25,w25,w22 add w24,w24,w27 // e+=rot(a,5) ror w21,w21,#2 - eor w9,w9,w6 + eor w9,w9,w6 add w23,w23,w8 // future e+=X[i] add w24,w24,w25 // e+=F(b,c,d) - ror w9,w9,#31 - eor w10,w10,w12 + ror w9,w9,#31 + eor w10,w10,w12 eor w25,w22,w20 ror w27,w24,#27 add w22,w22,w28 // future e+=K - eor w10,w10,w19 + eor w10,w10,w19 eor w25,w25,w21 add w23,w23,w27 // e+=rot(a,5) ror w20,w20,#2 - eor w10,w10,w7 + eor w10,w10,w7 add w22,w22,w9 // future e+=X[i] add w23,w23,w25 // e+=F(b,c,d) - ror w10,w10,#31 - eor w11,w11,w13 + ror w10,w10,#31 + eor w11,w11,w13 eor w25,w21,w24 ror w27,w23,#27 add w21,w21,w28 // future e+=K - eor w11,w11,w3 + eor w11,w11,w3 eor w25,w25,w20 add w22,w22,w27 // e+=rot(a,5) ror w24,w24,#2 - eor w11,w11,w8 + eor w11,w11,w8 add w21,w21,w10 // future e+=X[i] add w22,w22,w25 // e+=F(b,c,d) - ror w11,w11,#31 - eor w12,w12,w14 + ror w11,w11,#31 + eor w12,w12,w14 eor w25,w20,w23 ror w27,w22,#27 add w20,w20,w28 // future e+=K - eor w12,w12,w4 + eor w12,w12,w4 eor w25,w25,w24 add w21,w21,w27 // e+=rot(a,5) ror w23,w23,#2 - eor w12,w12,w9 + eor w12,w12,w9 add w20,w20,w11 // future e+=X[i] add w21,w21,w25 // e+=F(b,c,d) - ror w12,w12,#31 - eor w13,w13,w15 + ror w12,w12,#31 + eor w13,w13,w15 eor w25,w24,w22 ror w27,w21,#27 add w24,w24,w28 // future e+=K - eor w13,w13,w5 + eor w13,w13,w5 eor w25,w25,w23 add w20,w20,w27 // e+=rot(a,5) ror w22,w22,#2 - eor w13,w13,w10 + eor w13,w13,w10 add w24,w24,w12 // future e+=X[i] add w20,w20,w25 // e+=F(b,c,d) - ror w13,w13,#31 - eor w14,w14,w16 + ror w13,w13,#31 + eor w14,w14,w16 eor w25,w23,w21 ror w27,w20,#27 add w23,w23,w28 // future e+=K - eor w14,w14,w6 + eor w14,w14,w6 eor w25,w25,w22 add w24,w24,w27 // e+=rot(a,5) ror w21,w21,#2 - eor w14,w14,w11 + eor w14,w14,w11 add w23,w23,w13 // future e+=X[i] add w24,w24,w25 // e+=F(b,c,d) - ror w14,w14,#31 - eor w15,w15,w17 + ror w14,w14,#31 + eor w15,w15,w17 eor w25,w22,w20 ror w27,w24,#27 add w22,w22,w28 // future e+=K - eor w15,w15,w7 + eor w15,w15,w7 eor w25,w25,w21 add w23,w23,w27 // e+=rot(a,5) ror w20,w20,#2 - eor w15,w15,w12 + eor w15,w15,w12 add w22,w22,w14 // future e+=X[i] add w23,w23,w25 // e+=F(b,c,d) - ror w15,w15,#31 - eor w16,w16,w19 + ror w15,w15,#31 + eor w16,w16,w19 eor w25,w21,w24 ror w27,w23,#27 add w21,w21,w28 // future e+=K - eor w16,w16,w8 + eor w16,w16,w8 eor w25,w25,w20 add w22,w22,w27 // e+=rot(a,5) ror w24,w24,#2 - eor w16,w16,w13 + eor w16,w16,w13 add w21,w21,w15 // future e+=X[i] add w22,w22,w25 // e+=F(b,c,d) - ror w16,w16,#31 - eor w17,w17,w3 + ror w16,w16,#31 + eor w17,w17,w3 eor w25,w20,w23 ror w27,w22,#27 add w20,w20,w28 // future e+=K - eor w17,w17,w9 + eor w17,w17,w9 eor w25,w25,w24 add w21,w21,w27 // e+=rot(a,5) ror w23,w23,#2 - eor w17,w17,w14 + eor w17,w17,w14 add w20,w20,w16 // future e+=X[i] add w21,w21,w25 // e+=F(b,c,d) - ror w17,w17,#31 - eor w19,w19,w4 + ror w17,w17,#31 + eor w19,w19,w4 eor w25,w24,w22 ror w27,w21,#27 add w24,w24,w28 // future e+=K - eor w19,w19,w10 + eor w19,w19,w10 eor w25,w25,w23 add w20,w20,w27 // e+=rot(a,5) ror w22,w22,#2 - eor w19,w19,w15 + eor w19,w19,w15 add w24,w24,w17 // future e+=X[i] add w20,w20,w25 // e+=F(b,c,d) - ror w19,w19,#31 - eor w3,w3,w5 + ror w19,w19,#31 + eor w3,w3,w5 eor w25,w23,w21 ror w27,w20,#27 add w23,w23,w28 // future e+=K - eor w3,w3,w11 + eor w3,w3,w11 eor w25,w25,w22 add w24,w24,w27 // e+=rot(a,5) ror w21,w21,#2 - eor w3,w3,w16 + eor w3,w3,w16 add w23,w23,w19 // future e+=X[i] add w24,w24,w25 // e+=F(b,c,d) - ror w3,w3,#31 - eor w4,w4,w6 + ror w3,w3,#31 + eor w4,w4,w6 eor w25,w22,w20 ror w27,w24,#27 add w22,w22,w28 // future e+=K - eor w4,w4,w12 + eor w4,w4,w12 eor w25,w25,w21 add w23,w23,w27 // e+=rot(a,5) ror w20,w20,#2 - eor w4,w4,w17 + eor w4,w4,w17 add w22,w22,w3 // future e+=X[i] add w23,w23,w25 // e+=F(b,c,d) - ror w4,w4,#31 - eor w5,w5,w7 + ror w4,w4,#31 + eor w5,w5,w7 eor w25,w21,w24 ror w27,w23,#27 add w21,w21,w28 // future e+=K - eor w5,w5,w13 + eor w5,w5,w13 eor w25,w25,w20 add w22,w22,w27 // e+=rot(a,5) ror w24,w24,#2 - eor w5,w5,w19 + eor w5,w5,w19 add w21,w21,w4 // future e+=X[i] add w22,w22,w25 // e+=F(b,c,d) - ror w5,w5,#31 - eor w6,w6,w8 + ror w5,w5,#31 + eor w6,w6,w8 eor w25,w20,w23 ror w27,w22,#27 add w20,w20,w28 // future e+=K - eor w6,w6,w14 + eor w6,w6,w14 eor w25,w25,w24 add w21,w21,w27 // e+=rot(a,5) ror w23,w23,#2 - eor w6,w6,w3 + eor w6,w6,w3 add w20,w20,w5 // future e+=X[i] add w21,w21,w25 // e+=F(b,c,d) - ror w6,w6,#31 - eor w7,w7,w9 + ror w6,w6,#31 + eor w7,w7,w9 eor w25,w24,w22 ror w27,w21,#27 add w24,w24,w28 // future e+=K - eor w7,w7,w15 + eor w7,w7,w15 eor w25,w25,w23 add w20,w20,w27 // e+=rot(a,5) ror w22,w22,#2 - eor w7,w7,w4 + eor w7,w7,w4 add w24,w24,w6 // future e+=X[i] add w20,w20,w25 // e+=F(b,c,d) - ror w7,w7,#31 - eor w8,w8,w10 + ror w7,w7,#31 + eor w8,w8,w10 eor w25,w23,w21 ror w27,w20,#27 add w23,w23,w28 // future e+=K - eor w8,w8,w16 + eor w8,w8,w16 eor w25,w25,w22 add w24,w24,w27 // e+=rot(a,5) ror w21,w21,#2 - eor w8,w8,w5 + eor w8,w8,w5 add w23,w23,w7 // future e+=X[i] add w24,w24,w25 // e+=F(b,c,d) - ror w8,w8,#31 - eor w9,w9,w11 + ror w8,w8,#31 + eor w9,w9,w11 eor w25,w22,w20 ror w27,w24,#27 add w22,w22,w28 // future e+=K - eor w9,w9,w17 + eor w9,w9,w17 eor w25,w25,w21 add w23,w23,w27 // e+=rot(a,5) ror w20,w20,#2 - eor w9,w9,w6 + eor w9,w9,w6 add w22,w22,w8 // future e+=X[i] add w23,w23,w25 // e+=F(b,c,d) - ror w9,w9,#31 - eor w10,w10,w12 + ror w9,w9,#31 + eor w10,w10,w12 eor w25,w21,w24 ror w27,w23,#27 add w21,w21,w28 // future e+=K - eor w10,w10,w19 + eor w10,w10,w19 eor w25,w25,w20 add w22,w22,w27 // e+=rot(a,5) ror w24,w24,#2 - eor w10,w10,w7 + eor w10,w10,w7 add w21,w21,w9 // future e+=X[i] add w22,w22,w25 // e+=F(b,c,d) - ror w10,w10,#31 - eor w11,w11,w13 + ror w10,w10,#31 + eor w11,w11,w13 eor w25,w20,w23 ror w27,w22,#27 add w20,w20,w28 // future e+=K - eor w11,w11,w3 + eor w11,w11,w3 eor w25,w25,w24 add w21,w21,w27 // e+=rot(a,5) ror w23,w23,#2 - eor w11,w11,w8 + eor w11,w11,w8 add w20,w20,w10 // future e+=X[i] add w21,w21,w25 // e+=F(b,c,d) - ror w11,w11,#31 + ror w11,w11,#31 movz w28,#0xbcdc movk w28,#0x8f1b,lsl#16 - eor w12,w12,w14 + eor w12,w12,w14 eor w25,w24,w22 ror w27,w21,#27 add w24,w24,w28 // future e+=K - eor w12,w12,w4 + eor w12,w12,w4 eor w25,w25,w23 add w20,w20,w27 // e+=rot(a,5) ror w22,w22,#2 - eor w12,w12,w9 + eor w12,w12,w9 add w24,w24,w11 // future e+=X[i] add w20,w20,w25 // e+=F(b,c,d) - ror w12,w12,#31 + ror w12,w12,#31 orr w25,w21,w22 and w26,w21,w22 - eor w13,w13,w15 + eor w13,w13,w15 ror w27,w20,#27 and w25,w25,w23 add w23,w23,w28 // future e+=K - eor w13,w13,w5 + eor w13,w13,w5 add w24,w24,w27 // e+=rot(a,5) orr w25,w25,w26 ror w21,w21,#2 - eor w13,w13,w10 + eor w13,w13,w10 add w23,w23,w12 // future e+=X[i] add w24,w24,w25 // e+=F(b,c,d) - ror w13,w13,#31 + ror w13,w13,#31 orr w25,w20,w21 and w26,w20,w21 - eor w14,w14,w16 + eor w14,w14,w16 ror w27,w24,#27 and w25,w25,w22 add w22,w22,w28 // future e+=K - eor w14,w14,w6 + eor w14,w14,w6 add w23,w23,w27 // e+=rot(a,5) orr w25,w25,w26 ror w20,w20,#2 - eor w14,w14,w11 + eor w14,w14,w11 add w22,w22,w13 // future e+=X[i] add w23,w23,w25 // e+=F(b,c,d) - ror w14,w14,#31 + ror w14,w14,#31 orr w25,w24,w20 and w26,w24,w20 - eor w15,w15,w17 + eor w15,w15,w17 ror w27,w23,#27 and w25,w25,w21 add w21,w21,w28 // future e+=K - eor w15,w15,w7 + eor w15,w15,w7 add w22,w22,w27 // e+=rot(a,5) orr w25,w25,w26 ror w24,w24,#2 - eor w15,w15,w12 + eor w15,w15,w12 add w21,w21,w14 // future e+=X[i] add w22,w22,w25 // e+=F(b,c,d) - ror w15,w15,#31 + ror w15,w15,#31 orr w25,w23,w24 and w26,w23,w24 - eor w16,w16,w19 + eor w16,w16,w19 ror w27,w22,#27 and w25,w25,w20 add w20,w20,w28 // future e+=K - eor w16,w16,w8 + eor w16,w16,w8 add w21,w21,w27 // e+=rot(a,5) orr w25,w25,w26 ror w23,w23,#2 - eor w16,w16,w13 + eor w16,w16,w13 add w20,w20,w15 // future e+=X[i] add w21,w21,w25 // e+=F(b,c,d) - ror w16,w16,#31 + ror w16,w16,#31 orr w25,w22,w23 and w26,w22,w23 - eor w17,w17,w3 + eor w17,w17,w3 ror w27,w21,#27 and w25,w25,w24 add w24,w24,w28 // future e+=K - eor w17,w17,w9 + eor w17,w17,w9 add w20,w20,w27 // e+=rot(a,5) orr w25,w25,w26 ror w22,w22,#2 - eor w17,w17,w14 + eor w17,w17,w14 add w24,w24,w16 // future e+=X[i] add w20,w20,w25 // e+=F(b,c,d) - ror w17,w17,#31 + ror w17,w17,#31 orr w25,w21,w22 and w26,w21,w22 - eor w19,w19,w4 + eor w19,w19,w4 ror w27,w20,#27 and w25,w25,w23 add w23,w23,w28 // future e+=K - eor w19,w19,w10 + eor w19,w19,w10 add w24,w24,w27 // e+=rot(a,5) orr w25,w25,w26 ror w21,w21,#2 - eor w19,w19,w15 + eor w19,w19,w15 add w23,w23,w17 // future e+=X[i] add w24,w24,w25 // e+=F(b,c,d) - ror w19,w19,#31 + ror w19,w19,#31 orr w25,w20,w21 and w26,w20,w21 - eor w3,w3,w5 + eor w3,w3,w5 ror w27,w24,#27 and w25,w25,w22 add w22,w22,w28 // future e+=K - eor w3,w3,w11 + eor w3,w3,w11 add w23,w23,w27 // e+=rot(a,5) orr w25,w25,w26 ror w20,w20,#2 - eor w3,w3,w16 + eor w3,w3,w16 add w22,w22,w19 // future e+=X[i] add w23,w23,w25 // e+=F(b,c,d) - ror w3,w3,#31 + ror w3,w3,#31 orr w25,w24,w20 and w26,w24,w20 - eor w4,w4,w6 + eor w4,w4,w6 ror w27,w23,#27 and w25,w25,w21 add w21,w21,w28 // future e+=K - eor w4,w4,w12 + eor w4,w4,w12 add w22,w22,w27 // e+=rot(a,5) orr w25,w25,w26 ror w24,w24,#2 - eor w4,w4,w17 + eor w4,w4,w17 add w21,w21,w3 // future e+=X[i] add w22,w22,w25 // e+=F(b,c,d) - ror w4,w4,#31 + ror w4,w4,#31 orr w25,w23,w24 and w26,w23,w24 - eor w5,w5,w7 + eor w5,w5,w7 ror w27,w22,#27 and w25,w25,w20 add w20,w20,w28 // future e+=K - eor w5,w5,w13 + eor w5,w5,w13 add w21,w21,w27 // e+=rot(a,5) orr w25,w25,w26 ror w23,w23,#2 - eor w5,w5,w19 + eor w5,w5,w19 add w20,w20,w4 // future e+=X[i] add w21,w21,w25 // e+=F(b,c,d) - ror w5,w5,#31 + ror w5,w5,#31 orr w25,w22,w23 and w26,w22,w23 - eor w6,w6,w8 + eor w6,w6,w8 ror w27,w21,#27 and w25,w25,w24 add w24,w24,w28 // future e+=K - eor w6,w6,w14 + eor w6,w6,w14 add w20,w20,w27 // e+=rot(a,5) orr w25,w25,w26 ror w22,w22,#2 - eor w6,w6,w3 + eor w6,w6,w3 add w24,w24,w5 // future e+=X[i] add w20,w20,w25 // e+=F(b,c,d) - ror w6,w6,#31 + ror w6,w6,#31 orr w25,w21,w22 and w26,w21,w22 - eor w7,w7,w9 + eor w7,w7,w9 ror w27,w20,#27 and w25,w25,w23 add w23,w23,w28 // future e+=K - eor w7,w7,w15 + eor w7,w7,w15 add w24,w24,w27 // e+=rot(a,5) orr w25,w25,w26 ror w21,w21,#2 - eor w7,w7,w4 + eor w7,w7,w4 add w23,w23,w6 // future e+=X[i] add w24,w24,w25 // e+=F(b,c,d) - ror w7,w7,#31 + ror w7,w7,#31 orr w25,w20,w21 and w26,w20,w21 - eor w8,w8,w10 + eor w8,w8,w10 ror w27,w24,#27 and w25,w25,w22 add w22,w22,w28 // future e+=K - eor w8,w8,w16 + eor w8,w8,w16 add w23,w23,w27 // e+=rot(a,5) orr w25,w25,w26 ror w20,w20,#2 - eor w8,w8,w5 + eor w8,w8,w5 add w22,w22,w7 // future e+=X[i] add w23,w23,w25 // e+=F(b,c,d) - ror w8,w8,#31 + ror w8,w8,#31 orr w25,w24,w20 and w26,w24,w20 - eor w9,w9,w11 + eor w9,w9,w11 ror w27,w23,#27 and w25,w25,w21 add w21,w21,w28 // future e+=K - eor w9,w9,w17 + eor w9,w9,w17 add w22,w22,w27 // e+=rot(a,5) orr w25,w25,w26 ror w24,w24,#2 - eor w9,w9,w6 + eor w9,w9,w6 add w21,w21,w8 // future e+=X[i] add w22,w22,w25 // e+=F(b,c,d) - ror w9,w9,#31 + ror w9,w9,#31 orr w25,w23,w24 and w26,w23,w24 - eor w10,w10,w12 + eor w10,w10,w12 ror w27,w22,#27 and w25,w25,w20 add w20,w20,w28 // future e+=K - eor w10,w10,w19 + eor w10,w10,w19 add w21,w21,w27 // e+=rot(a,5) orr w25,w25,w26 ror w23,w23,#2 - eor w10,w10,w7 + eor w10,w10,w7 add w20,w20,w9 // future e+=X[i] add w21,w21,w25 // e+=F(b,c,d) - ror w10,w10,#31 + ror w10,w10,#31 orr w25,w22,w23 and w26,w22,w23 - eor w11,w11,w13 + eor w11,w11,w13 ror w27,w21,#27 and w25,w25,w24 add w24,w24,w28 // future e+=K - eor w11,w11,w3 + eor w11,w11,w3 add w20,w20,w27 // e+=rot(a,5) orr w25,w25,w26 ror w22,w22,#2 - eor w11,w11,w8 + eor w11,w11,w8 add w24,w24,w10 // future e+=X[i] add w20,w20,w25 // e+=F(b,c,d) - ror w11,w11,#31 + ror w11,w11,#31 orr w25,w21,w22 and w26,w21,w22 - eor w12,w12,w14 + eor w12,w12,w14 ror w27,w20,#27 and w25,w25,w23 add w23,w23,w28 // future e+=K - eor w12,w12,w4 + eor w12,w12,w4 add w24,w24,w27 // e+=rot(a,5) orr w25,w25,w26 ror w21,w21,#2 - eor w12,w12,w9 + eor w12,w12,w9 add w23,w23,w11 // future e+=X[i] add w24,w24,w25 // e+=F(b,c,d) - ror w12,w12,#31 + ror w12,w12,#31 orr w25,w20,w21 and w26,w20,w21 - eor w13,w13,w15 + eor w13,w13,w15 ror w27,w24,#27 and w25,w25,w22 add w22,w22,w28 // future e+=K - eor w13,w13,w5 + eor w13,w13,w5 add w23,w23,w27 // e+=rot(a,5) orr w25,w25,w26 ror w20,w20,#2 - eor w13,w13,w10 + eor w13,w13,w10 add w22,w22,w12 // future e+=X[i] add w23,w23,w25 // e+=F(b,c,d) - ror w13,w13,#31 + ror w13,w13,#31 orr w25,w24,w20 and w26,w24,w20 - eor w14,w14,w16 + eor w14,w14,w16 ror w27,w23,#27 and w25,w25,w21 add w21,w21,w28 // future e+=K - eor w14,w14,w6 + eor w14,w14,w6 add w22,w22,w27 // e+=rot(a,5) orr w25,w25,w26 ror w24,w24,#2 - eor w14,w14,w11 + eor w14,w14,w11 add w21,w21,w13 // future e+=X[i] add w22,w22,w25 // e+=F(b,c,d) - ror w14,w14,#31 + ror w14,w14,#31 orr w25,w23,w24 and w26,w23,w24 - eor w15,w15,w17 + eor w15,w15,w17 ror w27,w22,#27 and w25,w25,w20 add w20,w20,w28 // future e+=K - eor w15,w15,w7 + eor w15,w15,w7 add w21,w21,w27 // e+=rot(a,5) orr w25,w25,w26 ror w23,w23,#2 - eor w15,w15,w12 + eor w15,w15,w12 add w20,w20,w14 // future e+=X[i] add w21,w21,w25 // e+=F(b,c,d) - ror w15,w15,#31 + ror w15,w15,#31 movz w28,#0xc1d6 movk w28,#0xca62,lsl#16 orr w25,w22,w23 and w26,w22,w23 - eor w16,w16,w19 + eor w16,w16,w19 ror w27,w21,#27 and w25,w25,w24 add w24,w24,w28 // future e+=K - eor w16,w16,w8 + eor w16,w16,w8 add w20,w20,w27 // e+=rot(a,5) orr w25,w25,w26 ror w22,w22,#2 - eor w16,w16,w13 + eor w16,w16,w13 add w24,w24,w15 // future e+=X[i] add w20,w20,w25 // e+=F(b,c,d) - ror w16,w16,#31 - eor w17,w17,w3 + ror w16,w16,#31 + eor w17,w17,w3 eor w25,w23,w21 ror w27,w20,#27 add w23,w23,w28 // future e+=K - eor w17,w17,w9 + eor w17,w17,w9 eor w25,w25,w22 add w24,w24,w27 // e+=rot(a,5) ror w21,w21,#2 - eor w17,w17,w14 + eor w17,w17,w14 add w23,w23,w16 // future e+=X[i] add w24,w24,w25 // e+=F(b,c,d) - ror w17,w17,#31 - eor w19,w19,w4 + ror w17,w17,#31 + eor w19,w19,w4 eor w25,w22,w20 ror w27,w24,#27 add w22,w22,w28 // future e+=K - eor w19,w19,w10 + eor w19,w19,w10 eor w25,w25,w21 add w23,w23,w27 // e+=rot(a,5) ror w20,w20,#2 - eor w19,w19,w15 + eor w19,w19,w15 add w22,w22,w17 // future e+=X[i] add w23,w23,w25 // e+=F(b,c,d) - ror w19,w19,#31 - eor w3,w3,w5 + ror w19,w19,#31 + eor w3,w3,w5 eor w25,w21,w24 ror w27,w23,#27 add w21,w21,w28 // future e+=K - eor w3,w3,w11 + eor w3,w3,w11 eor w25,w25,w20 add w22,w22,w27 // e+=rot(a,5) ror w24,w24,#2 - eor w3,w3,w16 + eor w3,w3,w16 add w21,w21,w19 // future e+=X[i] add w22,w22,w25 // e+=F(b,c,d) - ror w3,w3,#31 - eor w4,w4,w6 + ror w3,w3,#31 + eor w4,w4,w6 eor w25,w20,w23 ror w27,w22,#27 add w20,w20,w28 // future e+=K - eor w4,w4,w12 + eor w4,w4,w12 eor w25,w25,w24 add w21,w21,w27 // e+=rot(a,5) ror w23,w23,#2 - eor w4,w4,w17 + eor w4,w4,w17 add w20,w20,w3 // future e+=X[i] add w21,w21,w25 // e+=F(b,c,d) - ror w4,w4,#31 - eor w5,w5,w7 + ror w4,w4,#31 + eor w5,w5,w7 eor w25,w24,w22 ror w27,w21,#27 add w24,w24,w28 // future e+=K - eor w5,w5,w13 + eor w5,w5,w13 eor w25,w25,w23 add w20,w20,w27 // e+=rot(a,5) ror w22,w22,#2 - eor w5,w5,w19 + eor w5,w5,w19 add w24,w24,w4 // future e+=X[i] add w20,w20,w25 // e+=F(b,c,d) - ror w5,w5,#31 - eor w6,w6,w8 + ror w5,w5,#31 + eor w6,w6,w8 eor w25,w23,w21 ror w27,w20,#27 add w23,w23,w28 // future e+=K - eor w6,w6,w14 + eor w6,w6,w14 eor w25,w25,w22 add w24,w24,w27 // e+=rot(a,5) ror w21,w21,#2 - eor w6,w6,w3 + eor w6,w6,w3 add w23,w23,w5 // future e+=X[i] add w24,w24,w25 // e+=F(b,c,d) - ror w6,w6,#31 - eor w7,w7,w9 + ror w6,w6,#31 + eor w7,w7,w9 eor w25,w22,w20 ror w27,w24,#27 add w22,w22,w28 // future e+=K - eor w7,w7,w15 + eor w7,w7,w15 eor w25,w25,w21 add w23,w23,w27 // e+=rot(a,5) ror w20,w20,#2 - eor w7,w7,w4 + eor w7,w7,w4 add w22,w22,w6 // future e+=X[i] add w23,w23,w25 // e+=F(b,c,d) - ror w7,w7,#31 - eor w8,w8,w10 + ror w7,w7,#31 + eor w8,w8,w10 eor w25,w21,w24 ror w27,w23,#27 add w21,w21,w28 // future e+=K - eor w8,w8,w16 + eor w8,w8,w16 eor w25,w25,w20 add w22,w22,w27 // e+=rot(a,5) ror w24,w24,#2 - eor w8,w8,w5 + eor w8,w8,w5 add w21,w21,w7 // future e+=X[i] add w22,w22,w25 // e+=F(b,c,d) - ror w8,w8,#31 - eor w9,w9,w11 + ror w8,w8,#31 + eor w9,w9,w11 eor w25,w20,w23 ror w27,w22,#27 add w20,w20,w28 // future e+=K - eor w9,w9,w17 + eor w9,w9,w17 eor w25,w25,w24 add w21,w21,w27 // e+=rot(a,5) ror w23,w23,#2 - eor w9,w9,w6 + eor w9,w9,w6 add w20,w20,w8 // future e+=X[i] add w21,w21,w25 // e+=F(b,c,d) - ror w9,w9,#31 - eor w10,w10,w12 + ror w9,w9,#31 + eor w10,w10,w12 eor w25,w24,w22 ror w27,w21,#27 add w24,w24,w28 // future e+=K - eor w10,w10,w19 + eor w10,w10,w19 eor w25,w25,w23 add w20,w20,w27 // e+=rot(a,5) ror w22,w22,#2 - eor w10,w10,w7 + eor w10,w10,w7 add w24,w24,w9 // future e+=X[i] add w20,w20,w25 // e+=F(b,c,d) - ror w10,w10,#31 - eor w11,w11,w13 + ror w10,w10,#31 + eor w11,w11,w13 eor w25,w23,w21 ror w27,w20,#27 add w23,w23,w28 // future e+=K - eor w11,w11,w3 + eor w11,w11,w3 eor w25,w25,w22 add w24,w24,w27 // e+=rot(a,5) ror w21,w21,#2 - eor w11,w11,w8 + eor w11,w11,w8 add w23,w23,w10 // future e+=X[i] add w24,w24,w25 // e+=F(b,c,d) - ror w11,w11,#31 - eor w12,w12,w14 + ror w11,w11,#31 + eor w12,w12,w14 eor w25,w22,w20 ror w27,w24,#27 add w22,w22,w28 // future e+=K - eor w12,w12,w4 + eor w12,w12,w4 eor w25,w25,w21 add w23,w23,w27 // e+=rot(a,5) ror w20,w20,#2 - eor w12,w12,w9 + eor w12,w12,w9 add w22,w22,w11 // future e+=X[i] add w23,w23,w25 // e+=F(b,c,d) - ror w12,w12,#31 - eor w13,w13,w15 + ror w12,w12,#31 + eor w13,w13,w15 eor w25,w21,w24 ror w27,w23,#27 add w21,w21,w28 // future e+=K - eor w13,w13,w5 + eor w13,w13,w5 eor w25,w25,w20 add w22,w22,w27 // e+=rot(a,5) ror w24,w24,#2 - eor w13,w13,w10 + eor w13,w13,w10 add w21,w21,w12 // future e+=X[i] add w22,w22,w25 // e+=F(b,c,d) - ror w13,w13,#31 - eor w14,w14,w16 + ror w13,w13,#31 + eor w14,w14,w16 eor w25,w20,w23 ror w27,w22,#27 add w20,w20,w28 // future e+=K - eor w14,w14,w6 + eor w14,w14,w6 eor w25,w25,w24 add w21,w21,w27 // e+=rot(a,5) ror w23,w23,#2 - eor w14,w14,w11 + eor w14,w14,w11 add w20,w20,w13 // future e+=X[i] add w21,w21,w25 // e+=F(b,c,d) - ror w14,w14,#31 - eor w15,w15,w17 + ror w14,w14,#31 + eor w15,w15,w17 eor w25,w24,w22 ror w27,w21,#27 add w24,w24,w28 // future e+=K - eor w15,w15,w7 + eor w15,w15,w7 eor w25,w25,w23 add w20,w20,w27 // e+=rot(a,5) ror w22,w22,#2 - eor w15,w15,w12 + eor w15,w15,w12 add w24,w24,w14 // future e+=X[i] add w20,w20,w25 // e+=F(b,c,d) - ror w15,w15,#31 - eor w16,w16,w19 + ror w15,w15,#31 + eor w16,w16,w19 eor w25,w23,w21 ror w27,w20,#27 add w23,w23,w28 // future e+=K - eor w16,w16,w8 + eor w16,w16,w8 eor w25,w25,w22 add w24,w24,w27 // e+=rot(a,5) ror w21,w21,#2 - eor w16,w16,w13 + eor w16,w16,w13 add w23,w23,w15 // future e+=X[i] add w24,w24,w25 // e+=F(b,c,d) - ror w16,w16,#31 - eor w17,w17,w3 + ror w16,w16,#31 + eor w17,w17,w3 eor w25,w22,w20 ror w27,w24,#27 add w22,w22,w28 // future e+=K - eor w17,w17,w9 + eor w17,w17,w9 eor w25,w25,w21 add w23,w23,w27 // e+=rot(a,5) ror w20,w20,#2 - eor w17,w17,w14 + eor w17,w17,w14 add w22,w22,w16 // future e+=X[i] add w23,w23,w25 // e+=F(b,c,d) - ror w17,w17,#31 - eor w19,w19,w4 + ror w17,w17,#31 + eor w19,w19,w4 eor w25,w21,w24 ror w27,w23,#27 add w21,w21,w28 // future e+=K - eor w19,w19,w10 + eor w19,w19,w10 eor w25,w25,w20 add w22,w22,w27 // e+=rot(a,5) ror w24,w24,#2 - eor w19,w19,w15 + eor w19,w19,w15 add w21,w21,w17 // future e+=X[i] add w22,w22,w25 // e+=F(b,c,d) - ror w19,w19,#31 + ror w19,w19,#31 ldp w4,w5,[x0] eor w25,w20,w23 ror w27,w22,#27 @@ -1080,10 +1081,10 @@ sha1_block_armv8: ld1 {v0.4s},[x0],#16 ld1 {v1.s}[0],[x0] sub x0,x0,#16 - ld1 {v16.4s-v19.4s},[x4] + ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x4] .Loop_hw: - ld1 {v4.16b-v7.16b},[x1],#64 + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 sub x2,x2,#1 rev32 v4.16b,v4.16b rev32 v5.16b,v5.16b @@ -1094,98 +1095,98 @@ sha1_block_armv8: add v21.4s,v16.4s,v5.4s rev32 v7.16b,v7.16b - .inst 0x5e280803 //sha1h v3.16b,v0.16b - .inst 0x5e140020 //sha1c v0.16b,v1.16b,v20.4s // 0 +.inst 0x5e280803 //sha1h v3.16b,v0.16b +.inst 0x5e140020 //sha1c v0.16b,v1.16b,v20.4s // 0 add v20.4s,v16.4s,v6.4s - .inst 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b - .inst 0x5e280802 //sha1h v2.16b,v0.16b // 1 - .inst 0x5e150060 //sha1c v0.16b,v3.16b,v21.4s +.inst 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b +.inst 0x5e280802 //sha1h v2.16b,v0.16b // 1 +.inst 0x5e150060 //sha1c v0.16b,v3.16b,v21.4s add v21.4s,v16.4s,v7.4s - .inst 0x5e2818e4 //sha1su1 v4.16b,v7.16b - .inst 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b - .inst 0x5e280803 //sha1h v3.16b,v0.16b // 2 - .inst 0x5e140040 //sha1c v0.16b,v2.16b,v20.4s +.inst 0x5e2818e4 //sha1su1 v4.16b,v7.16b +.inst 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b +.inst 0x5e280803 //sha1h v3.16b,v0.16b // 2 +.inst 0x5e140040 //sha1c v0.16b,v2.16b,v20.4s add v20.4s,v16.4s,v4.4s - .inst 0x5e281885 //sha1su1 v5.16b,v4.16b - .inst 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b - .inst 0x5e280802 //sha1h v2.16b,v0.16b // 3 - .inst 0x5e150060 //sha1c v0.16b,v3.16b,v21.4s +.inst 0x5e281885 //sha1su1 v5.16b,v4.16b +.inst 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b +.inst 0x5e280802 //sha1h v2.16b,v0.16b // 3 +.inst 0x5e150060 //sha1c v0.16b,v3.16b,v21.4s add v21.4s,v17.4s,v5.4s - .inst 0x5e2818a6 //sha1su1 v6.16b,v5.16b - .inst 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b - .inst 0x5e280803 //sha1h v3.16b,v0.16b // 4 - .inst 0x5e140040 //sha1c v0.16b,v2.16b,v20.4s +.inst 0x5e2818a6 //sha1su1 v6.16b,v5.16b +.inst 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b +.inst 0x5e280803 //sha1h v3.16b,v0.16b // 4 +.inst 0x5e140040 //sha1c v0.16b,v2.16b,v20.4s add v20.4s,v17.4s,v6.4s - .inst 0x5e2818c7 //sha1su1 v7.16b,v6.16b - .inst 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b - .inst 0x5e280802 //sha1h v2.16b,v0.16b // 5 - .inst 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s +.inst 0x5e2818c7 //sha1su1 v7.16b,v6.16b +.inst 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b +.inst 0x5e280802 //sha1h v2.16b,v0.16b // 5 +.inst 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s add v21.4s,v17.4s,v7.4s - .inst 0x5e2818e4 //sha1su1 v4.16b,v7.16b - .inst 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b - .inst 0x5e280803 //sha1h v3.16b,v0.16b // 6 - .inst 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s +.inst 0x5e2818e4 //sha1su1 v4.16b,v7.16b +.inst 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b +.inst 0x5e280803 //sha1h v3.16b,v0.16b // 6 +.inst 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s add v20.4s,v17.4s,v4.4s - .inst 0x5e281885 //sha1su1 v5.16b,v4.16b - .inst 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b - .inst 0x5e280802 //sha1h v2.16b,v0.16b // 7 - .inst 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s +.inst 0x5e281885 //sha1su1 v5.16b,v4.16b +.inst 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b +.inst 0x5e280802 //sha1h v2.16b,v0.16b // 7 +.inst 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s add v21.4s,v17.4s,v5.4s - .inst 0x5e2818a6 //sha1su1 v6.16b,v5.16b - .inst 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b - .inst 0x5e280803 //sha1h v3.16b,v0.16b // 8 - .inst 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s +.inst 0x5e2818a6 //sha1su1 v6.16b,v5.16b +.inst 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b +.inst 0x5e280803 //sha1h v3.16b,v0.16b // 8 +.inst 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s add v20.4s,v18.4s,v6.4s - .inst 0x5e2818c7 //sha1su1 v7.16b,v6.16b - .inst 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b - .inst 0x5e280802 //sha1h v2.16b,v0.16b // 9 - .inst 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s +.inst 0x5e2818c7 //sha1su1 v7.16b,v6.16b +.inst 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b +.inst 0x5e280802 //sha1h v2.16b,v0.16b // 9 +.inst 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s add v21.4s,v18.4s,v7.4s - .inst 0x5e2818e4 //sha1su1 v4.16b,v7.16b - .inst 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b - .inst 0x5e280803 //sha1h v3.16b,v0.16b // 10 - .inst 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s +.inst 0x5e2818e4 //sha1su1 v4.16b,v7.16b +.inst 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b +.inst 0x5e280803 //sha1h v3.16b,v0.16b // 10 +.inst 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s add v20.4s,v18.4s,v4.4s - .inst 0x5e281885 //sha1su1 v5.16b,v4.16b - .inst 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b - .inst 0x5e280802 //sha1h v2.16b,v0.16b // 11 - .inst 0x5e152060 //sha1m v0.16b,v3.16b,v21.4s +.inst 0x5e281885 //sha1su1 v5.16b,v4.16b +.inst 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b +.inst 0x5e280802 //sha1h v2.16b,v0.16b // 11 +.inst 0x5e152060 //sha1m v0.16b,v3.16b,v21.4s add v21.4s,v18.4s,v5.4s - .inst 0x5e2818a6 //sha1su1 v6.16b,v5.16b - .inst 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b - .inst 0x5e280803 //sha1h v3.16b,v0.16b // 12 - .inst 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s +.inst 0x5e2818a6 //sha1su1 v6.16b,v5.16b +.inst 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b +.inst 0x5e280803 //sha1h v3.16b,v0.16b // 12 +.inst 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s add v20.4s,v18.4s,v6.4s - .inst 0x5e2818c7 //sha1su1 v7.16b,v6.16b - .inst 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b - .inst 0x5e280802 //sha1h v2.16b,v0.16b // 13 - .inst 0x5e152060 //sha1m v0.16b,v3.16b,v21.4s +.inst 0x5e2818c7 //sha1su1 v7.16b,v6.16b +.inst 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b +.inst 0x5e280802 //sha1h v2.16b,v0.16b // 13 +.inst 0x5e152060 //sha1m v0.16b,v3.16b,v21.4s add v21.4s,v19.4s,v7.4s - .inst 0x5e2818e4 //sha1su1 v4.16b,v7.16b - .inst 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b - .inst 0x5e280803 //sha1h v3.16b,v0.16b // 14 - .inst 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s +.inst 0x5e2818e4 //sha1su1 v4.16b,v7.16b +.inst 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b +.inst 0x5e280803 //sha1h v3.16b,v0.16b // 14 +.inst 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s add v20.4s,v19.4s,v4.4s - .inst 0x5e281885 //sha1su1 v5.16b,v4.16b - .inst 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b - .inst 0x5e280802 //sha1h v2.16b,v0.16b // 15 - .inst 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s +.inst 0x5e281885 //sha1su1 v5.16b,v4.16b +.inst 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b +.inst 0x5e280802 //sha1h v2.16b,v0.16b // 15 +.inst 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s add v21.4s,v19.4s,v5.4s - .inst 0x5e2818a6 //sha1su1 v6.16b,v5.16b - .inst 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b - .inst 0x5e280803 //sha1h v3.16b,v0.16b // 16 - .inst 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s +.inst 0x5e2818a6 //sha1su1 v6.16b,v5.16b +.inst 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b +.inst 0x5e280803 //sha1h v3.16b,v0.16b // 16 +.inst 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s add v20.4s,v19.4s,v6.4s - .inst 0x5e2818c7 //sha1su1 v7.16b,v6.16b - .inst 0x5e280802 //sha1h v2.16b,v0.16b // 17 - .inst 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s +.inst 0x5e2818c7 //sha1su1 v7.16b,v6.16b +.inst 0x5e280802 //sha1h v2.16b,v0.16b // 17 +.inst 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s add v21.4s,v19.4s,v7.4s - .inst 0x5e280803 //sha1h v3.16b,v0.16b // 18 - .inst 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s +.inst 0x5e280803 //sha1h v3.16b,v0.16b // 18 +.inst 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s - .inst 0x5e280802 //sha1h v2.16b,v0.16b // 19 - .inst 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s +.inst 0x5e280802 //sha1h v2.16b,v0.16b // 19 +.inst 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s add v1.4s,v1.4s,v2.4s add v0.4s,v0.4s,v22.4s @@ -1206,6 +1207,7 @@ sha1_block_armv8: .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 //K_60_79 .LOPENSSL_armcap_P: .quad OPENSSL_armcap_P-. -.asciz "SHA1 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>" +.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 .align 2 .comm OPENSSL_armcap_P,4,4 diff --git a/linux-aarch64/crypto/sha/sha256-armv8.S b/linux-aarch64/crypto/sha/sha256-armv8.S index bd43b1f..ec572e9 100644 --- a/linux-aarch64/crypto/sha/sha256-armv8.S +++ b/linux-aarch64/crypto/sha/sha256-armv8.S @@ -2,6 +2,7 @@ .text + .globl sha256_block_data_order .type sha256_block_data_order,%function .align 6 @@ -27,7 +28,7 @@ sha256_block_data_order: ldp w24,w25,[x0,#4*4] add x2,x1,x2,lsl#6 // end of input ldp w26,w27,[x0,#6*4] - adr x30,K256 + adr x30,.LK256 stp x0,x2,[x29,#96] .Loop: @@ -975,167 +976,168 @@ sha256_block_data_order: .size sha256_block_data_order,.-sha256_block_data_order .align 6 -.type K256,%object -K256: - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - .long 0 //terminator -.size K256,.-K256 +.type .LK256,%object +.LK256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.long 0 //terminator +.size .LK256,.-.LK256 .align 3 .LOPENSSL_armcap_P: - .quad OPENSSL_armcap_P-. -.asciz "SHA256 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>" +.quad OPENSSL_armcap_P-. +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 .align 2 .type sha256_block_armv8,%function .align 6 sha256_block_armv8: .Lv8_entry: - stp x29,x30,[sp,#-16]! - add x29,sp,#0 + stp x29,x30,[sp,#-16]! + add x29,sp,#0 - ld1 {v0.4s,v1.4s},[x0] - adr x3,K256 + ld1 {v0.4s,v1.4s},[x0] + adr x3,.LK256 .Loop_hw: - ld1 {v4.16b-v7.16b},[x1],#64 - sub x2,x2,#1 - ld1 {v16.4s},[x3],#16 - rev32 v4.16b,v4.16b - rev32 v5.16b,v5.16b - rev32 v6.16b,v6.16b - rev32 v7.16b,v7.16b - orr v18.16b,v0.16b,v0.16b // offload - orr v19.16b,v1.16b,v1.16b - ld1 {v17.4s},[x3],#16 - add v16.4s,v16.4s,v4.4s - .inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b - orr v2.16b,v0.16b,v0.16b - .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s - .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s - .inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b - ld1 {v16.4s},[x3],#16 - add v17.4s,v17.4s,v5.4s - .inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b - orr v2.16b,v0.16b,v0.16b - .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s - .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s - .inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b - ld1 {v17.4s},[x3],#16 - add v16.4s,v16.4s,v6.4s - .inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b - orr v2.16b,v0.16b,v0.16b - .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s - .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s - .inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b - ld1 {v16.4s},[x3],#16 - add v17.4s,v17.4s,v7.4s - .inst 0x5e282887 //sha256su0 v7.16b,v4.16b - orr v2.16b,v0.16b,v0.16b - .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s - .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s - .inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b - ld1 {v17.4s},[x3],#16 - add v16.4s,v16.4s,v4.4s - .inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b - orr v2.16b,v0.16b,v0.16b - .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s - .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s - .inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b - ld1 {v16.4s},[x3],#16 - add v17.4s,v17.4s,v5.4s - .inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b - orr v2.16b,v0.16b,v0.16b - .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s - .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s - .inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b - ld1 {v17.4s},[x3],#16 - add v16.4s,v16.4s,v6.4s - .inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b - orr v2.16b,v0.16b,v0.16b - .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s - .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s - .inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b - ld1 {v16.4s},[x3],#16 - add v17.4s,v17.4s,v7.4s - .inst 0x5e282887 //sha256su0 v7.16b,v4.16b - orr v2.16b,v0.16b,v0.16b - .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s - .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s - .inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b - ld1 {v17.4s},[x3],#16 - add v16.4s,v16.4s,v4.4s - .inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b - orr v2.16b,v0.16b,v0.16b - .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s - .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s - .inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b - ld1 {v16.4s},[x3],#16 - add v17.4s,v17.4s,v5.4s - .inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b - orr v2.16b,v0.16b,v0.16b - .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s - .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s - .inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b - ld1 {v17.4s},[x3],#16 - add v16.4s,v16.4s,v6.4s - .inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b - orr v2.16b,v0.16b,v0.16b - .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s - .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s - .inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b - ld1 {v16.4s},[x3],#16 - add v17.4s,v17.4s,v7.4s - .inst 0x5e282887 //sha256su0 v7.16b,v4.16b - orr v2.16b,v0.16b,v0.16b - .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s - .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s - .inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b - ld1 {v17.4s},[x3],#16 - add v16.4s,v16.4s,v4.4s - orr v2.16b,v0.16b,v0.16b - .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s - .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + sub x2,x2,#1 + ld1 {v16.4s},[x3],#16 + rev32 v4.16b,v4.16b + rev32 v5.16b,v5.16b + rev32 v6.16b,v6.16b + rev32 v7.16b,v7.16b + orr v18.16b,v0.16b,v0.16b // offload + orr v19.16b,v1.16b,v1.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.inst 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.inst 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.inst 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s - ld1 {v16.4s},[x3],#16 - add v17.4s,v17.4s,v5.4s - orr v2.16b,v0.16b,v0.16b - .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s - .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s - ld1 {v17.4s},[x3] - add v16.4s,v16.4s,v6.4s - sub x3,x3,#64*4-16 // rewind - orr v2.16b,v0.16b,v0.16b - .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s - .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + ld1 {v17.4s},[x3] + add v16.4s,v16.4s,v6.4s + sub x3,x3,#64*4-16 // rewind + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s - add v17.4s,v17.4s,v7.4s - orr v2.16b,v0.16b,v0.16b - .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s - .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + add v17.4s,v17.4s,v7.4s + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s - add v0.4s,v0.4s,v18.4s - add v1.4s,v1.4s,v19.4s + add v0.4s,v0.4s,v18.4s + add v1.4s,v1.4s,v19.4s - cbnz x2,.Loop_hw + cbnz x2,.Loop_hw - st1 {v0.4s,v1.4s},[x0] + st1 {v0.4s,v1.4s},[x0] - ldr x29,[sp],#16 + ldr x29,[sp],#16 ret .size sha256_block_armv8,.-sha256_block_armv8 .comm OPENSSL_armcap_P,4,4 diff --git a/linux-aarch64/crypto/sha/sha512-armv8.S b/linux-aarch64/crypto/sha/sha512-armv8.S index 6b0d194..8fc342a 100644 --- a/linux-aarch64/crypto/sha/sha512-armv8.S +++ b/linux-aarch64/crypto/sha/sha512-armv8.S @@ -2,6 +2,7 @@ .text + .globl sha512_block_data_order .type sha512_block_data_order,%function .align 6 @@ -21,7 +22,7 @@ sha512_block_data_order: ldp x24,x25,[x0,#4*8] add x2,x1,x2,lsl#7 // end of input ldp x26,x27,[x0,#6*8] - adr x30,K512 + adr x30,.LK512 stp x0,x2,[x29,#96] .Loop: @@ -969,53 +970,54 @@ sha512_block_data_order: .size sha512_block_data_order,.-sha512_block_data_order .align 6 -.type K512,%object -K512: - .quad 0x428a2f98d728ae22,0x7137449123ef65cd - .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc - .quad 0x3956c25bf348b538,0x59f111f1b605d019 - .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 - .quad 0xd807aa98a3030242,0x12835b0145706fbe - .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 - .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 - .quad 0x9bdc06a725c71235,0xc19bf174cf692694 - .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 - .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 - .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 - .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 - .quad 0x983e5152ee66dfab,0xa831c66d2db43210 - .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 - .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 - .quad 0x06ca6351e003826f,0x142929670a0e6e70 - .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 - .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df - .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 - .quad 0x81c2c92e47edaee6,0x92722c851482353b - .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 - .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 - .quad 0xd192e819d6ef5218,0xd69906245565a910 - .quad 0xf40e35855771202a,0x106aa07032bbd1b8 - .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 - .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 - .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb - .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 - .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 - .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec - .quad 0x90befffa23631e28,0xa4506cebde82bde9 - .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b - .quad 0xca273eceea26619c,0xd186b8c721c0c207 - .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 - .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 - .quad 0x113f9804bef90dae,0x1b710b35131c471b - .quad 0x28db77f523047d84,0x32caab7b40c72493 - .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c - .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a - .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 - .quad 0 // terminator -.size K512,.-K512 +.type .LK512,%object +.LK512: +.quad 0x428a2f98d728ae22,0x7137449123ef65cd +.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc +.quad 0x3956c25bf348b538,0x59f111f1b605d019 +.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 +.quad 0xd807aa98a3030242,0x12835b0145706fbe +.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 +.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 +.quad 0x9bdc06a725c71235,0xc19bf174cf692694 +.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 +.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 +.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 +.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 +.quad 0x983e5152ee66dfab,0xa831c66d2db43210 +.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 +.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 +.quad 0x06ca6351e003826f,0x142929670a0e6e70 +.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 +.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df +.quad 0x650a73548baf63de,0x766a0abb3c77b2a8 +.quad 0x81c2c92e47edaee6,0x92722c851482353b +.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 +.quad 0xc24b8b70d0f89791,0xc76c51a30654be30 +.quad 0xd192e819d6ef5218,0xd69906245565a910 +.quad 0xf40e35855771202a,0x106aa07032bbd1b8 +.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 +.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 +.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb +.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 +.quad 0x748f82ee5defb2fc,0x78a5636f43172f60 +.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec +.quad 0x90befffa23631e28,0xa4506cebde82bde9 +.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b +.quad 0xca273eceea26619c,0xd186b8c721c0c207 +.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 +.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 +.quad 0x113f9804bef90dae,0x1b710b35131c471b +.quad 0x28db77f523047d84,0x32caab7b40c72493 +.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c +.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a +.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 +.quad 0 // terminator +.size .LK512,.-.LK512 .align 3 .LOPENSSL_armcap_P: - .quad OPENSSL_armcap_P-. -.asciz "SHA512 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>" +.quad OPENSSL_armcap_P-. +.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 .align 2 .comm OPENSSL_armcap_P,4,4 |