Diffstat (limited to 'linux-aarch64/crypto/aes/aesv8-armx.S')
-rw-r--r--  linux-aarch64/crypto/aes/aesv8-armx.S  520
1 file changed, 271 insertions(+), 249 deletions(-)
diff --git a/linux-aarch64/crypto/aes/aesv8-armx.S b/linux-aarch64/crypto/aes/aesv8-armx.S
index e7ae46f..9c63291 100644
--- a/linux-aarch64/crypto/aes/aesv8-armx.S
+++ b/linux-aarch64/crypto/aes/aesv8-armx.S
@@ -6,7 +6,7 @@
.arch armv8-a+crypto
#endif
.align 5
-rcon:
+.Lrcon:
.long 0x01,0x01,0x01,0x01
.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
.long 0x1b,0x1b,0x1b,0x1b
@@ -31,7 +31,7 @@ aes_v8_set_encrypt_key:
tst w1,#0x3f
b.ne .Lenc_key_abort
- adr x3,rcon
+ adr x3,.Lrcon
cmp w1,#192
eor v0.16b,v0.16b,v0.16b
@@ -55,7 +55,7 @@ aes_v8_set_encrypt_key:
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
- eor v6.16b,v6.16b,v1.16b
+ eor v6.16b,v6.16b,v1.16b
eor v3.16b,v3.16b,v5.16b
shl v1.16b,v1.16b,#1
eor v3.16b,v3.16b,v6.16b
@@ -72,7 +72,7 @@ aes_v8_set_encrypt_key:
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
- eor v6.16b,v6.16b,v1.16b
+ eor v6.16b,v6.16b,v1.16b
eor v3.16b,v3.16b,v5.16b
shl v1.16b,v1.16b,#1
eor v3.16b,v3.16b,v6.16b
@@ -86,7 +86,7 @@ aes_v8_set_encrypt_key:
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
- eor v6.16b,v6.16b,v1.16b
+ eor v6.16b,v6.16b,v1.16b
eor v3.16b,v3.16b,v5.16b
eor v3.16b,v3.16b,v6.16b
st1 {v3.4s},[x2]
@@ -117,7 +117,7 @@ aes_v8_set_encrypt_key:
dup v5.4s,v3.s[3]
eor v5.16b,v5.16b,v4.16b
- eor v6.16b,v6.16b,v1.16b
+ eor v6.16b,v6.16b,v1.16b
ext v4.16b,v0.16b,v4.16b,#12
shl v1.16b,v1.16b,#1
eor v4.16b,v4.16b,v5.16b
@@ -148,7 +148,7 @@ aes_v8_set_encrypt_key:
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
- eor v6.16b,v6.16b,v1.16b
+ eor v6.16b,v6.16b,v1.16b
eor v3.16b,v3.16b,v5.16b
shl v1.16b,v1.16b,#1
eor v3.16b,v3.16b,v6.16b
@@ -229,17 +229,17 @@ aes_v8_encrypt:
.Loop_enc:
aese v2.16b,v0.16b
- ld1 {v0.4s},[x2],#16
aesmc v2.16b,v2.16b
+ ld1 {v0.4s},[x2],#16
subs w3,w3,#2
aese v2.16b,v1.16b
- ld1 {v1.4s},[x2],#16
aesmc v2.16b,v2.16b
+ ld1 {v1.4s},[x2],#16
b.gt .Loop_enc
aese v2.16b,v0.16b
- ld1 {v0.4s},[x2]
aesmc v2.16b,v2.16b
+ ld1 {v0.4s},[x2]
aese v2.16b,v1.16b
eor v2.16b,v2.16b,v0.16b
@@ -258,17 +258,17 @@ aes_v8_decrypt:
.Loop_dec:
aesd v2.16b,v0.16b
- ld1 {v0.4s},[x2],#16
aesimc v2.16b,v2.16b
+ ld1 {v0.4s},[x2],#16
subs w3,w3,#2
aesd v2.16b,v1.16b
- ld1 {v1.4s},[x2],#16
aesimc v2.16b,v2.16b
+ ld1 {v1.4s},[x2],#16
b.gt .Loop_dec
aesd v2.16b,v0.16b
- ld1 {v0.4s},[x2]
aesimc v2.16b,v2.16b
+ ld1 {v0.4s},[x2]
aesd v2.16b,v1.16b
eor v2.16b,v2.16b,v0.16b
@@ -292,13 +292,13 @@ aes_v8_cbc_encrypt:
ld1 {v6.16b},[x4]
ld1 {v0.16b},[x0],x8
- ld1 {v16.4s-v17.4s},[x3] // load key schedule...
+ ld1 {v16.4s,v17.4s},[x3] // load key schedule...
sub w5,w5,#6
add x7,x3,x5,lsl#4 // pointer to last 7 round keys
sub w5,w5,#2
- ld1 {v18.4s-v19.4s},[x7],#32
- ld1 {v20.4s-v21.4s},[x7],#32
- ld1 {v22.4s-v23.4s},[x7],#32
+ ld1 {v18.4s,v19.4s},[x7],#32
+ ld1 {v20.4s,v21.4s},[x7],#32
+ ld1 {v22.4s,v23.4s},[x7],#32
ld1 {v7.4s},[x7]
add x7,x3,#32
@@ -310,76 +310,99 @@ aes_v8_cbc_encrypt:
eor v5.16b,v16.16b,v7.16b
b.eq .Lcbc_enc128
+ ld1 {v2.4s,v3.4s},[x7]
+ add x7,x3,#16
+ add x6,x3,#16*4
+ add x12,x3,#16*5
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ add x14,x3,#16*6
+ add x3,x3,#16*7
+ b .Lenter_cbc_enc
+
+.align 4
.Loop_cbc_enc:
aese v0.16b,v16.16b
- ld1 {v16.4s},[x7],#16
aesmc v0.16b,v0.16b
- subs w6,w6,#2
+ st1 {v6.16b},[x1],#16
+.Lenter_cbc_enc:
aese v0.16b,v17.16b
- ld1 {v17.4s},[x7],#16
aesmc v0.16b,v0.16b
- b.gt .Loop_cbc_enc
+ aese v0.16b,v2.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v16.4s},[x6]
+ cmp w5,#4
+ aese v0.16b,v3.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v17.4s},[x12]
+ b.eq .Lcbc_enc192
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
- subs x2,x2,#16
+ ld1 {v16.4s},[x14]
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
- csel x8,xzr,x8,eq
+ ld1 {v17.4s},[x3]
+ nop
+
+.Lcbc_enc192:
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ subs x2,x2,#16
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ csel x8,xzr,x8,eq
aese v0.16b,v18.16b
aesmc v0.16b,v0.16b
- add x7,x3,#16
aese v0.16b,v19.16b
aesmc v0.16b,v0.16b
- ld1 {v16.16b},[x0],x8
+ ld1 {v16.16b},[x0],x8
aese v0.16b,v20.16b
aesmc v0.16b,v0.16b
- eor v16.16b,v16.16b,v5.16b
+ eor v16.16b,v16.16b,v5.16b
aese v0.16b,v21.16b
aesmc v0.16b,v0.16b
- ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
+ ld1 {v17.4s},[x7] // re-pre-load rndkey[1]
aese v0.16b,v22.16b
aesmc v0.16b,v0.16b
aese v0.16b,v23.16b
-
- mov w6,w5
eor v6.16b,v0.16b,v7.16b
- st1 {v6.16b},[x1],#16
b.hs .Loop_cbc_enc
+ st1 {v6.16b},[x1],#16
b .Lcbc_done
.align 5
.Lcbc_enc128:
- ld1 {v2.4s-v3.4s},[x7]
+ ld1 {v2.4s,v3.4s},[x7]
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
b .Lenter_cbc_enc128
.Loop_cbc_enc128:
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
- st1 {v6.16b},[x1],#16
+ st1 {v6.16b},[x1],#16
.Lenter_cbc_enc128:
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
- subs x2,x2,#16
+ subs x2,x2,#16
aese v0.16b,v2.16b
aesmc v0.16b,v0.16b
- csel x8,xzr,x8,eq
+ csel x8,xzr,x8,eq
aese v0.16b,v3.16b
aesmc v0.16b,v0.16b
aese v0.16b,v18.16b
aesmc v0.16b,v0.16b
aese v0.16b,v19.16b
aesmc v0.16b,v0.16b
- ld1 {v16.16b},[x0],x8
+ ld1 {v16.16b},[x0],x8
aese v0.16b,v20.16b
aesmc v0.16b,v0.16b
aese v0.16b,v21.16b
aesmc v0.16b,v0.16b
aese v0.16b,v22.16b
aesmc v0.16b,v0.16b
- eor v16.16b,v16.16b,v5.16b
+ eor v16.16b,v16.16b,v5.16b
aese v0.16b,v23.16b
eor v6.16b,v0.16b,v7.16b
b.hs .Loop_cbc_enc128
@@ -404,81 +427,80 @@ aes_v8_cbc_encrypt:
.Loop3x_cbc_dec:
aesd v0.16b,v16.16b
- aesd v1.16b,v16.16b
- aesd v18.16b,v16.16b
- ld1 {v16.4s},[x7],#16
aesimc v0.16b,v0.16b
+ aesd v1.16b,v16.16b
aesimc v1.16b,v1.16b
+ aesd v18.16b,v16.16b
aesimc v18.16b,v18.16b
+ ld1 {v16.4s},[x7],#16
subs w6,w6,#2
aesd v0.16b,v17.16b
- aesd v1.16b,v17.16b
- aesd v18.16b,v17.16b
- ld1 {v17.4s},[x7],#16
aesimc v0.16b,v0.16b
+ aesd v1.16b,v17.16b
aesimc v1.16b,v1.16b
+ aesd v18.16b,v17.16b
aesimc v18.16b,v18.16b
+ ld1 {v17.4s},[x7],#16
b.gt .Loop3x_cbc_dec
aesd v0.16b,v16.16b
- aesd v1.16b,v16.16b
- aesd v18.16b,v16.16b
- eor v4.16b,v6.16b,v7.16b
aesimc v0.16b,v0.16b
+ aesd v1.16b,v16.16b
aesimc v1.16b,v1.16b
+ aesd v18.16b,v16.16b
aesimc v18.16b,v18.16b
- eor v5.16b,v2.16b,v7.16b
+ eor v4.16b,v6.16b,v7.16b
+ subs x2,x2,#0x30
+ eor v5.16b,v2.16b,v7.16b
+ csel x6,x2,x6,lo // x6, w6, is zero at this point
aesd v0.16b,v17.16b
- aesd v1.16b,v17.16b
- aesd v18.16b,v17.16b
- eor v17.16b,v3.16b,v7.16b
- subs x2,x2,#0x30
aesimc v0.16b,v0.16b
+ aesd v1.16b,v17.16b
aesimc v1.16b,v1.16b
+ aesd v18.16b,v17.16b
aesimc v18.16b,v18.16b
- orr v6.16b,v19.16b,v19.16b
- csel x6,x2,x6,lo // x6, w6, is zero at this point
- aesd v0.16b,v20.16b
- aesd v1.16b,v20.16b
- aesd v18.16b,v20.16b
- add x0,x0,x6 // x0 is adjusted in such way that
+ eor v17.16b,v3.16b,v7.16b
+ add x0,x0,x6 // x0 is adjusted in such way that
// at exit from the loop v1.16b-v18.16b
// are loaded with last "words"
+ orr v6.16b,v19.16b,v19.16b
+ mov x7,x3
+ aesd v0.16b,v20.16b
aesimc v0.16b,v0.16b
+ aesd v1.16b,v20.16b
aesimc v1.16b,v1.16b
+ aesd v18.16b,v20.16b
aesimc v18.16b,v18.16b
- mov x7,x3
+ ld1 {v2.16b},[x0],#16
aesd v0.16b,v21.16b
- aesd v1.16b,v21.16b
- aesd v18.16b,v21.16b
- ld1 {v2.16b},[x0],#16
aesimc v0.16b,v0.16b
+ aesd v1.16b,v21.16b
aesimc v1.16b,v1.16b
+ aesd v18.16b,v21.16b
aesimc v18.16b,v18.16b
- ld1 {v3.16b},[x0],#16
+ ld1 {v3.16b},[x0],#16
aesd v0.16b,v22.16b
- aesd v1.16b,v22.16b
- aesd v18.16b,v22.16b
- ld1 {v19.16b},[x0],#16
aesimc v0.16b,v0.16b
+ aesd v1.16b,v22.16b
aesimc v1.16b,v1.16b
+ aesd v18.16b,v22.16b
aesimc v18.16b,v18.16b
- ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
+ ld1 {v19.16b},[x0],#16
aesd v0.16b,v23.16b
aesd v1.16b,v23.16b
aesd v18.16b,v23.16b
-
- add w6,w5,#2
+ ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
+ add w6,w5,#2
eor v4.16b,v4.16b,v0.16b
eor v5.16b,v5.16b,v1.16b
eor v18.16b,v18.16b,v17.16b
- ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
- orr v0.16b,v2.16b,v2.16b
+ ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
st1 {v4.16b},[x1],#16
- orr v1.16b,v3.16b,v3.16b
+ orr v0.16b,v2.16b,v2.16b
st1 {v5.16b},[x1],#16
+ orr v1.16b,v3.16b,v3.16b
st1 {v18.16b},[x1],#16
- orr v18.16b,v19.16b,v19.16b
+ orr v18.16b,v19.16b,v19.16b
b.hs .Loop3x_cbc_dec
cmn x2,#0x30
@@ -487,54 +509,54 @@ aes_v8_cbc_encrypt:
.Lcbc_dec_tail:
aesd v1.16b,v16.16b
- aesd v18.16b,v16.16b
- ld1 {v16.4s},[x7],#16
aesimc v1.16b,v1.16b
+ aesd v18.16b,v16.16b
aesimc v18.16b,v18.16b
+ ld1 {v16.4s},[x7],#16
subs w6,w6,#2
aesd v1.16b,v17.16b
- aesd v18.16b,v17.16b
- ld1 {v17.4s},[x7],#16
aesimc v1.16b,v1.16b
+ aesd v18.16b,v17.16b
aesimc v18.16b,v18.16b
+ ld1 {v17.4s},[x7],#16
b.gt .Lcbc_dec_tail
aesd v1.16b,v16.16b
- aesd v18.16b,v16.16b
aesimc v1.16b,v1.16b
+ aesd v18.16b,v16.16b
aesimc v18.16b,v18.16b
aesd v1.16b,v17.16b
- aesd v18.16b,v17.16b
aesimc v1.16b,v1.16b
+ aesd v18.16b,v17.16b
aesimc v18.16b,v18.16b
aesd v1.16b,v20.16b
- aesd v18.16b,v20.16b
aesimc v1.16b,v1.16b
+ aesd v18.16b,v20.16b
aesimc v18.16b,v18.16b
- cmn x2,#0x20
+ cmn x2,#0x20
aesd v1.16b,v21.16b
- aesd v18.16b,v21.16b
aesimc v1.16b,v1.16b
+ aesd v18.16b,v21.16b
aesimc v18.16b,v18.16b
- eor v5.16b,v6.16b,v7.16b
+ eor v5.16b,v6.16b,v7.16b
aesd v1.16b,v22.16b
- aesd v18.16b,v22.16b
aesimc v1.16b,v1.16b
+ aesd v18.16b,v22.16b
aesimc v18.16b,v18.16b
- eor v17.16b,v3.16b,v7.16b
+ eor v17.16b,v3.16b,v7.16b
aesd v1.16b,v23.16b
aesd v18.16b,v23.16b
b.eq .Lcbc_dec_one
eor v5.16b,v5.16b,v1.16b
eor v17.16b,v17.16b,v18.16b
- orr v6.16b,v19.16b,v19.16b
+ orr v6.16b,v19.16b,v19.16b
st1 {v5.16b},[x1],#16
st1 {v17.16b},[x1],#16
b .Lcbc_done
.Lcbc_dec_one:
eor v5.16b,v5.16b,v18.16b
- orr v6.16b,v19.16b,v19.16b
+ orr v6.16b,v19.16b,v19.16b
st1 {v5.16b},[x1],#16
.Lcbc_done:
@@ -547,181 +569,181 @@ aes_v8_cbc_encrypt:
.type aes_v8_ctr32_encrypt_blocks,%function
.align 5
aes_v8_ctr32_encrypt_blocks:
- stp x29,x30,[sp,#-16]!
- add x29,sp,#0
- ldr w5,[x3,#240]
-
- ldr w8, [x4, #12]
- ld1 {v0.4s},[x4]
-
- ld1 {v16.4s-v17.4s},[x3] // load key schedule...
- sub w5,w5,#4
- mov x12,#16
- cmp x2,#2
- add x7,x3,x5,lsl#4 // pointer to last 5 round keys
- sub w5,w5,#2
- ld1 {v20.4s-v21.4s},[x7],#32
- ld1 {v22.4s-v23.4s},[x7],#32
- ld1 {v7.4s},[x7]
- add x7,x3,#32
- mov w6,w5
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ ldr w5,[x3,#240]
+
+ ldr w8, [x4, #12]
+ ld1 {v0.4s},[x4]
+
+ ld1 {v16.4s,v17.4s},[x3] // load key schedule...
+ sub w5,w5,#4
+ mov x12,#16
+ cmp x2,#2
+ add x7,x3,x5,lsl#4 // pointer to last 5 round keys
+ sub w5,w5,#2
+ ld1 {v20.4s,v21.4s},[x7],#32
+ ld1 {v22.4s,v23.4s},[x7],#32
+ ld1 {v7.4s},[x7]
+ add x7,x3,#32
+ mov w6,w5
csel x12,xzr,x12,lo
#ifndef __ARMEB__
- rev w8, w8
+ rev w8, w8
#endif
- orr v1.16b,v0.16b,v0.16b
- add w10, w8, #1
- orr v18.16b,v0.16b,v0.16b
- add w8, w8, #2
- orr v6.16b,v0.16b,v0.16b
- rev w10, w10
- mov v1.s[3],w10
- b.ls .Lctr32_tail
- rev w12, w8
- sub x2,x2,#3 // bias
- mov v18.s[3],w12
- b .Loop3x_ctr32
+ orr v1.16b,v0.16b,v0.16b
+ add w10, w8, #1
+ orr v18.16b,v0.16b,v0.16b
+ add w8, w8, #2
+ orr v6.16b,v0.16b,v0.16b
+ rev w10, w10
+ mov v1.s[3],w10
+ b.ls .Lctr32_tail
+ rev w12, w8
+ sub x2,x2,#3 // bias
+ mov v18.s[3],w12
+ b .Loop3x_ctr32
.align 4
.Loop3x_ctr32:
- aese v0.16b,v16.16b
- aese v1.16b,v16.16b
- aese v18.16b,v16.16b
- ld1 {v16.4s},[x7],#16
- aesmc v0.16b,v0.16b
- aesmc v1.16b,v1.16b
- aesmc v18.16b,v18.16b
- subs w6,w6,#2
- aese v0.16b,v17.16b
- aese v1.16b,v17.16b
- aese v18.16b,v17.16b
- ld1 {v17.4s},[x7],#16
- aesmc v0.16b,v0.16b
- aesmc v1.16b,v1.16b
- aesmc v18.16b,v18.16b
- b.gt .Loop3x_ctr32
-
- aese v0.16b,v16.16b
- aese v1.16b,v16.16b
- aese v18.16b,v16.16b
- mov x7,x3
- aesmc v4.16b,v0.16b
- ld1 {v2.16b},[x0],#16
- aesmc v5.16b,v1.16b
- aesmc v18.16b,v18.16b
- orr v0.16b,v6.16b,v6.16b
- aese v4.16b,v17.16b
- ld1 {v3.16b},[x0],#16
- aese v5.16b,v17.16b
- aese v18.16b,v17.16b
- orr v1.16b,v6.16b,v6.16b
- aesmc v4.16b,v4.16b
- ld1 {v19.16b},[x0],#16
- aesmc v5.16b,v5.16b
- aesmc v17.16b,v18.16b
- orr v18.16b,v6.16b,v6.16b
- add w9,w8,#1
- aese v4.16b,v20.16b
- aese v5.16b,v20.16b
- aese v17.16b,v20.16b
- eor v2.16b,v2.16b,v7.16b
- add w10,w8,#2
- aesmc v4.16b,v4.16b
- aesmc v5.16b,v5.16b
- aesmc v17.16b,v17.16b
- eor v3.16b,v3.16b,v7.16b
- add w8,w8,#3
- aese v4.16b,v21.16b
- aese v5.16b,v21.16b
- aese v17.16b,v21.16b
- eor v19.16b,v19.16b,v7.16b
- rev w9,w9
- aesmc v4.16b,v4.16b
- ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
- aesmc v5.16b,v5.16b
- aesmc v17.16b,v17.16b
- mov v0.s[3], w9
- rev w10,w10
- aese v4.16b,v22.16b
- aese v5.16b,v22.16b
- aese v17.16b,v22.16b
- mov v1.s[3], w10
- rev w12,w8
- aesmc v4.16b,v4.16b
- aesmc v5.16b,v5.16b
- aesmc v17.16b,v17.16b
- mov v18.s[3], w12
- subs x2,x2,#3
- aese v4.16b,v23.16b
- aese v5.16b,v23.16b
- aese v17.16b,v23.16b
-
- mov w6,w5
- eor v2.16b,v2.16b,v4.16b
- eor v3.16b,v3.16b,v5.16b
- eor v19.16b,v19.16b,v17.16b
- ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
- st1 {v2.16b},[x1],#16
- st1 {v3.16b},[x1],#16
- st1 {v19.16b},[x1],#16
- b.hs .Loop3x_ctr32
-
- adds x2,x2,#3
- b.eq .Lctr32_done
- cmp x2,#1
- mov x12,#16
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v18.16b,v16.16b
+ aesmc v18.16b,v18.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ aese v18.16b,v17.16b
+ aesmc v18.16b,v18.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt .Loop3x_ctr32
+
+ aese v0.16b,v16.16b
+ aesmc v4.16b,v0.16b
+ aese v1.16b,v16.16b
+ aesmc v5.16b,v1.16b
+ ld1 {v2.16b},[x0],#16
+ orr v0.16b,v6.16b,v6.16b
+ aese v18.16b,v16.16b
+ aesmc v18.16b,v18.16b
+ ld1 {v3.16b},[x0],#16
+ orr v1.16b,v6.16b,v6.16b
+ aese v4.16b,v17.16b
+ aesmc v4.16b,v4.16b
+ aese v5.16b,v17.16b
+ aesmc v5.16b,v5.16b
+ ld1 {v19.16b},[x0],#16
+ mov x7,x3
+ aese v18.16b,v17.16b
+ aesmc v17.16b,v18.16b
+ orr v18.16b,v6.16b,v6.16b
+ add w9,w8,#1
+ aese v4.16b,v20.16b
+ aesmc v4.16b,v4.16b
+ aese v5.16b,v20.16b
+ aesmc v5.16b,v5.16b
+ eor v2.16b,v2.16b,v7.16b
+ add w10,w8,#2
+ aese v17.16b,v20.16b
+ aesmc v17.16b,v17.16b
+ eor v3.16b,v3.16b,v7.16b
+ add w8,w8,#3
+ aese v4.16b,v21.16b
+ aesmc v4.16b,v4.16b
+ aese v5.16b,v21.16b
+ aesmc v5.16b,v5.16b
+ eor v19.16b,v19.16b,v7.16b
+ rev w9,w9
+ aese v17.16b,v21.16b
+ aesmc v17.16b,v17.16b
+ mov v0.s[3], w9
+ rev w10,w10
+ aese v4.16b,v22.16b
+ aesmc v4.16b,v4.16b
+ aese v5.16b,v22.16b
+ aesmc v5.16b,v5.16b
+ mov v1.s[3], w10
+ rev w12,w8
+ aese v17.16b,v22.16b
+ aesmc v17.16b,v17.16b
+ mov v18.s[3], w12
+ subs x2,x2,#3
+ aese v4.16b,v23.16b
+ aese v5.16b,v23.16b
+ aese v17.16b,v23.16b
+
+ eor v2.16b,v2.16b,v4.16b
+ ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
+ st1 {v2.16b},[x1],#16
+ eor v3.16b,v3.16b,v5.16b
+ mov w6,w5
+ st1 {v3.16b},[x1],#16
+ eor v19.16b,v19.16b,v17.16b
+ ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
+ st1 {v19.16b},[x1],#16
+ b.hs .Loop3x_ctr32
+
+ adds x2,x2,#3
+ b.eq .Lctr32_done
+ cmp x2,#1
+ mov x12,#16
csel x12,xzr,x12,eq
.Lctr32_tail:
- aese v0.16b,v16.16b
- aese v1.16b,v16.16b
- ld1 {v16.4s},[x7],#16
- aesmc v0.16b,v0.16b
- aesmc v1.16b,v1.16b
- subs w6,w6,#2
- aese v0.16b,v17.16b
- aese v1.16b,v17.16b
- ld1 {v17.4s},[x7],#16
- aesmc v0.16b,v0.16b
- aesmc v1.16b,v1.16b
- b.gt .Lctr32_tail
-
- aese v0.16b,v16.16b
- aese v1.16b,v16.16b
- aesmc v0.16b,v0.16b
- aesmc v1.16b,v1.16b
- aese v0.16b,v17.16b
- aese v1.16b,v17.16b
- aesmc v0.16b,v0.16b
- aesmc v1.16b,v1.16b
- ld1 {v2.16b},[x0],x12
- aese v0.16b,v20.16b
- aese v1.16b,v20.16b
- ld1 {v3.16b},[x0]
- aesmc v0.16b,v0.16b
- aesmc v1.16b,v1.16b
- aese v0.16b,v21.16b
- aese v1.16b,v21.16b
- aesmc v0.16b,v0.16b
- aesmc v1.16b,v1.16b
- aese v0.16b,v22.16b
- aese v1.16b,v22.16b
- eor v2.16b,v2.16b,v7.16b
- aesmc v0.16b,v0.16b
- aesmc v1.16b,v1.16b
- eor v3.16b,v3.16b,v7.16b
- aese v0.16b,v23.16b
- aese v1.16b,v23.16b
-
- cmp x2,#1
- eor v2.16b,v2.16b,v0.16b
- eor v3.16b,v3.16b,v1.16b
- st1 {v2.16b},[x1],#16
- b.eq .Lctr32_done
- st1 {v3.16b},[x1]
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt .Lctr32_tail
+
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ ld1 {v2.16b},[x0],x12
+ aese v0.16b,v20.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v20.16b
+ aesmc v1.16b,v1.16b
+ ld1 {v3.16b},[x0]
+ aese v0.16b,v21.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v21.16b
+ aesmc v1.16b,v1.16b
+ eor v2.16b,v2.16b,v7.16b
+ aese v0.16b,v22.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v22.16b
+ aesmc v1.16b,v1.16b
+ eor v3.16b,v3.16b,v7.16b
+ aese v0.16b,v23.16b
+ aese v1.16b,v23.16b
+
+ cmp x2,#1
+ eor v2.16b,v2.16b,v0.16b
+ eor v3.16b,v3.16b,v1.16b
+ st1 {v2.16b},[x1],#16
+ b.eq .Lctr32_done
+ st1 {v3.16b},[x1]
.Lctr32_done:
- ldr x29,[sp],#16
+ ldr x29,[sp],#16
ret
.size aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks
#endif
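
Two things change in this diff: the rcon table becomes the local label .Lrcon, and each aese/aesd is re-paired with its matching aesmc/aesimc so the two sit adjacent, a pairing that some ARMv8 cores (e.g. Cortex-A57-class) can fuse into a single macro-op. The exported entry points and their calling convention are untouched, so the same C-side usage applies to both sides of the diff. The sketch below shows how these routines are conventionally declared and driven; the prototypes and the AES_KEY layout (round-key words followed by the round count at byte offset 240, matching the "ldr w5,[x3,#240]" loads above) follow the usual OpenSSL/BoringSSL "hwaes" conventions and should be verified against this tree's headers before use.

    /* Hypothetical harness -- prototypes assumed from the usual
     * OpenSSL/BoringSSL hwaes conventions; check this tree's headers. */
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Layout the assembly expects: round-key words, then the round
     * count at byte offset 240 (cf. "ldr w5,[x3,#240]" above). */
    typedef struct {
        uint32_t rd_key[60];
        int rounds;
    } AES_KEY;

    int  aes_v8_set_encrypt_key(const uint8_t *user_key, int bits,
                                AES_KEY *key);
    void aes_v8_encrypt(const uint8_t *in, uint8_t *out,
                        const AES_KEY *key);
    void aes_v8_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t length,
                            const AES_KEY *key, uint8_t *ivec, int enc);
    void aes_v8_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out,
                                     size_t blocks, const AES_KEY *key,
                                     const uint8_t ivec[16]);

    int main(void) {
        static const uint8_t raw_key[16] = {0};   /* all-zero demo key */
        uint8_t iv[16] = {0}, pt[48] = {0}, ct[48];
        AES_KEY ks;

        if (aes_v8_set_encrypt_key(raw_key, 128, &ks) != 0)
            return 1;                             /* bad key length */

        /* CBC: length is in bytes and must be a multiple of 16; the iv
         * is updated in place so chained calls continue the stream. */
        aes_v8_cbc_encrypt(pt, ct, sizeof(pt), &ks, iv, 1 /* encrypt */);

        /* CTR: length is in 16-byte blocks, and only the low 32 bits of
         * the counter (big-endian last word of ivec) are incremented. */
        uint8_t ctr_iv[16] = {0};
        aes_v8_ctr32_encrypt_blocks(pt, ct, sizeof(pt) / 16, &ks, ctr_iv);

        for (size_t i = 0; i < 16; i++) printf("%02x", ct[i]);
        putchar('\n');
        return 0;
    }

To link this, assemble the .S file on an AArch64 target; the preprocessor guard it sits under compiles it away elsewhere, and the machine must actually implement the v8 Crypto Extensions (.arch armv8-a+crypto) for the aese/aesd instructions to execute.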